# Setup for Performance Test

## Path Configuration

In [1]:
import os
import sys
import argparse

# Relative path three directory levels up ("../../../"); used as the base
# for the "kyoka" and "sample" directories below.
root = "../"*3
src_path = os.path.join(root, "kyoka")
sample_path = os.path.join(root, "sample")
# Extend the module search path so the kyoka.* and sample.* imports that
# follow can be resolved.
sys.path.append(root)
sys.path.append(src_path)
sys.path.append(sample_path)

import logging as log
log.basicConfig(format='[%(levelname)s] %(message)s', level=log.INFO)

from kyoka.algorithm.montecarlo.montecarlo import MonteCarlo
from kyoka.algorithm.td_learning.sarsa import Sarsa
from kyoka.algorithm.td_learning.q_learning import QLearning
from kyoka.algorithm.td_learning.sarsa_lambda import SarsaLambda
from kyoka.algorithm.td_learning.q_lambda import QLambda

from kyoka.policy.greedy_policy import GreedyPolicy
from kyoka.policy.epsilon_greedy_policy import EpsilonGreedyPolicy
from kyoka.finish_rule.watch_iteration_count import WatchIterationCount

from sample.ticktacktoe.ticktacktoe_domain import TickTackToeDomain
from sample.ticktacktoe.ticktacktoe_keras_value_function import TickTackToeKerasValueFunction
from sample.ticktacktoe.ticktacktoe_helper import TickTackToeHelper
from sample.ticktacktoe.ticktacktoe_manual_policy import TickTackToeManualPolicy
from sample.ticktacktoe.ticktacktoe_perfect_policy import TickTackToePerfectPolicy
from sample.ticktacktoe.ticktacktoe_performance_logger import TickTackToePerformanceLogger

Using Theano backend.


## Define Constants for the Performance Test

In [2]:
# Passed as log_interval to WatchIterationCount in run_performance_test.
LOG_INTERVAL = 1000
# Passed to the performance logger via set_test_game_count.
TEST_GAME_COUNT = 10
# Passed to the performance logger via set_performance_test_interval; also
# the spacing of the x-axis labels in visualize_test_result.
TEST_INTERVAL = 50000
# Given to both the domain below and the performance logger.
IS_FIRST_PLAYER = True

# Module-level domain instance used by run_performance_test.
domain = TickTackToeDomain(is_first_player=IS_FIRST_PLAYER)

## Set Up Global Helpers for the Performance Test

In [3]:
def gen_performance_logger():
    """Create a TickTackToePerformanceLogger configured from the module constants.

    The logger receives TEST_INTERVAL, IS_FIRST_PLAYER and TEST_GAME_COUNT
    through its setter methods, in that order.

    Returns:
        The configured TickTackToePerformanceLogger instance.
    """
    perf_logger = TickTackToePerformanceLogger()
    perf_logger.set_performance_test_interval(TEST_INTERVAL)
    perf_logger.set_is_first_player(IS_FIRST_PLAYER)
    perf_logger.set_test_game_count(TEST_GAME_COUNT)
    return perf_logger

def run_performance_test(rl_algo, epsilon, test_length):
    """Run GPI with ``rl_algo`` on the module-level domain and return the game log.

    Args:
        rl_algo: algorithm object exposing ``set_gpi_callback`` and ``GPI``.
        epsilon: value passed as ``eps`` to EpsilonGreedyPolicy.
        test_length: value passed as ``target_count`` to WatchIterationCount.

    Returns:
        The ``game_log`` attribute of the performance logger that was
        attached to ``rl_algo`` for this run.
    """
    iteration_limit = WatchIterationCount(
        target_count=test_length,
        log_interval=LOG_INTERVAL,
    )

    # A fresh value function is created and set up for every run.
    approximator = TickTackToeKerasValueFunction()
    approximator.setUp()

    exploring_policy = EpsilonGreedyPolicy(eps=epsilon)

    perf_logger = gen_performance_logger()
    rl_algo.set_gpi_callback(perf_logger)

    rl_algo.GPI(domain, exploring_policy, approximator, [iteration_limit])
    return perf_logger.game_log

In [4]:
%matplotlib inline
import seaborn
import matplotlib.pyplot as plt

def visualize_test_result(title, performance_test_result):
    """Plot the lose, draw and win series of a performance test as line charts.

    Args:
        title: text used as the figure title.
        performance_test_result: sequence whose items carry, at index 1, a
            sequence with the lose, draw and win values at positions 0, 1
            and 2 (presumably the ``game_log`` returned by
            run_performance_test -- confirm against the logger).
    """
    sample_count = len(performance_test_result)
    # The k-th sample (1-based) is labelled with k * TEST_INTERVAL.
    iterations = [TEST_INTERVAL * k for k in range(1, sample_count + 1)]

    rate_rows = [entry[1] for entry in performance_test_result]
    lose_log = [row[0] for row in rate_rows]
    draw_log = [row[1] for row in rate_rows]
    win_log = [row[2] for row in rate_rows]

    plt.figure(figsize=(10, 5))
    series_with_captions = (
        (lose_log, "lose rate"),
        (draw_log, "draw rate"),
        (win_log, "win rate"),
    )
    for series, caption in series_with_captions:
        plt.plot(iterations, series, label=caption)

    plt.xlabel("GPI iteration")
    plt.ylabel("rate(%)")
    plt.title(title)
    plt.legend(loc=1)

    plt.show()

In [5]:
# Maps each single-bit action value (1, 2, 4, ..., 256) to the name of the
# board cell it denotes. Bit 0 is the lower-right cell and bit 8 the
# upper-left cell, so the names are listed from lowest to highest bit.
ACTION_NAME_MAP = {
    1 << bit: cell_name
    for bit, cell_name in enumerate((
        "lower_right",
        "lower_center",
        "lower_left",
        "middle_right",
        "middle_center",
        "middle_left",
        "upper_right",
        "upper_center",
        "upper_left",
    ))
}

def gen_test_case():
    """Build the hand-written board positions used to inspect a learned policy.

    Every board is a 9-bit mask whose bits follow ACTION_NAME_MAP
    (256 = upper_left ... 1 = lower_right). It is written below as a 9-digit
    binary string that reads row by row from the upper-left cell to the
    lower-right cell. In the diagrams "O" marks first-player cells and "X"
    marks second-player cells.

    Returns:
        A list of ``(state, answer)`` pairs. ``state`` is the tuple
        ``(first_player_board, second_player_board)``; ``answer`` maps the
        action name of every empty cell to its hand-assigned score
        (1, 0 or -1).
    """
    def bin2i(bits):
        # Parse a binary digit string into the integer board mask.
        return int(bits, 2)

    # - - -
    # - - -
    # - - -
    first_player_board = bin2i("000000000")
    second_player_board = bin2i("000000000")
    case1 = (first_player_board, second_player_board)
    answer1 = {
        'upper_right': 0, 'lower_left': 0, 'upper_left': 0, 'lower_right': 0, 'lower_center': 0,
        'middle_right': 0, 'middle_left': 0, 'upper_center': 0, 'middle_center': 0
    }

    # O O -
    # - - -
    # - X X
    first_player_board = bin2i("110000000")
    second_player_board = bin2i("000000011")
    case2 = (first_player_board, second_player_board)
    answer2 = {'middle_right': -1, 'upper_right': 1, 'middle_left': -1, 'lower_left': 1, 'middle_center': -1}

    # - - -
    # - O X
    # - - -
    # The original literal had ten digits ("0000010000"); the value (16,
    # middle_center) is unchanged, only the width now matches the board.
    first_player_board = bin2i("000010000")
    second_player_board = bin2i("000001000")
    case3 = (first_player_board, second_player_board)
    answer3 = {'upper_right': 1, 'lower_left': 1, 'upper_left': 1, 'lower_right': 1, 'lower_center': 1, 'middle_left': 0, 'upper_center': 1}

    # - O -
    # - - X
    # - - -
    first_player_board = bin2i("010000000")
    second_player_board = bin2i("000001000")
    case4 = (first_player_board, second_player_board)
    answer4 = {'upper_right': 1, 'lower_left': 0, 'upper_left': -1, 'lower_right': 0, 'lower_center': -1, 'middle_left': 0, 'middle_center': 1}

    # - - O
    # - - X
    # - - -
    first_player_board = bin2i("001000000")
    second_player_board = bin2i("000001000")
    case5 = (first_player_board, second_player_board)
    answer5 = {'lower_left': 0, 'upper_left': 1, 'lower_right': 0, 'lower_center': 0, 'middle_left': 0, 'upper_center': 1, 'middle_center': 1}

    cases = [case1, case2, case3, case4, case5]
    answers = [answer1, answer2, answer3, answer4, answer5]
    # list(...) keeps the result a list even where zip() is lazy.
    return list(zip(cases, answers))

def visualize_policy(model_weights_path):
    """Print expected answers next to the learned action values for each test position.

    For every ``(state, answer)`` pair from gen_test_case, prints the board
    rendering, the hand-written answers and the values computed by the
    loaded value function, both sorted by value in descending order.

    Args:
        model_weights_path: path handed to the value function's
            ``load_model_weights``.
    """
    # This local domain is built with default arguments and shadows the
    # module-level ``domain`` inside this function only.
    domain = TickTackToeDomain()
    value_func = TickTackToeKerasValueFunction()
    value_func.setUp()
    value_func.load_model_weights(model_weights_path)
    for state, answer in gen_test_case():
        # Single-argument print(...) emits the same text under the Python 2
        # print statement and the Python 3 print function.
        print(TickTackToeHelper.visualize_board(state))
        # Query the possible actions once and reuse them for names and values.
        possible_actions = list(domain.generate_possible_actions(state))
        actions = [ACTION_NAME_MAP[action] for action in possible_actions]
        values = [value_func.calculate_value(state, action) for action in possible_actions]
        policy = dict(zip(actions, values))
        # sorted(...)[::-1] is kept (rather than reverse=True) so that ties
        # are printed in the same order as before.
        print("answer => %s" % sorted(answer.items(), key=lambda item: item[1])[::-1])
        print("policy => %s" % sorted(policy.items(), key=lambda item: item[1])[::-1])
        # Blank separator line; print("") avoids printing "()" under Python 2.
        print("")

# Sarsa

In [None]:
# ``time`` is not imported by the setup cell, so import it here to keep this
# cell runnable on its own.
import time

# Wall-clock timestamp taken just before the run starts.
start_time = time.time()
TEST_LENGTH = 5000000
EPSILON = 0.7
performance_test_result = run_performance_test(Sarsa(alpha=0.1, gamma=0.7), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("Sarsa", performance_test_result)

<img src="./resource/5million_gpi/Sarsa_eps_0.700000_test_result.png" />

In [6]:
# Print the expected answers and the values computed from this weights file.
visualize_policy("./resource/5million_gpi/Sarsa_0.700000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('middle_center', 0.16172975301742554), ('lower_left', 0.157234787940979), ('upper_left', 0.15287885069847107), ('upper_right', 0.14654052257537842), ('lower_right', 0.13548466563224792), ('middle_right', 0.12535545229911804), ('middle_left', 0.11796092987060547), ('lower_center', 0.11647436022758484), ('upper_center', 0.10265439748764038)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 0.94527852535247803), ('lower_left', 0.42774173617362976), ('middle_left', 0.38773348927497864), ('middle_right', 0.26842990517616272), ('middle_center', 0.10787613689899445)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('up

# QLearning

In [None]:
# Run settings for the QLearning performance test.
TEST_LENGTH = 5000000
EPSILON = 0.7
performance_test_result = run_performance_test(
    QLearning(alpha=0.1, gamma=0.7),
    epsilon=EPSILON,
    test_length=TEST_LENGTH,
)
visualize_test_result("QLearning", performance_test_result)

<img src="./resource/5million_gpi/QLearning_eps_0.700000_test_result.png" />

In [9]:
# Print the expected answers and the values computed from this weights file.
visualize_policy("./resource/5million_gpi/QLearning_0.700000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('lower_left', 0.29407450556755066), ('upper_left', 0.28823432326316833), ('upper_right', 0.2830960750579834), ('middle_left', 0.28111934661865234), ('middle_right', 0.28003492951393127), ('lower_right', 0.27665066719055176), ('upper_center', 0.26782119274139404), ('lower_center', 0.25591105222702026), ('middle_center', 0.25582700967788696)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 1.0543733835220337), ('lower_left', 0.55598914623260498), ('middle_left', 0.47618782520294189), ('middle_right', 0.46231412887573242), ('middle_center', 0.41034692525863647)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('up

# SarsaLambda

In [None]:
# run_performance_test requires epsilon and test_length, and
# visualize_test_result requires a title; the original calls omitted them.
# NOTE(review): TEST_LENGTH and EPSILON mirror the Sarsa cell, and
# SarsaLambda() is kept with its default constructor arguments -- confirm
# the intended hyper-parameters for this run.
TEST_LENGTH = 5000000
EPSILON = 0.7
performance_test_result = run_performance_test(SarsaLambda(), epsilon=EPSILON, test_length=TEST_LENGTH)
visualize_test_result("SarsaLambda", performance_test_result)

# QLambda

In [None]:
# Run settings for the QLambda performance test.
TEST_LENGTH = 5000000
EPSILON = 0.3
performance_test_result = run_performance_test(
    QLambda(alpha=0.1, gamma=0.7),
    epsilon=EPSILON,
    test_length=TEST_LENGTH,
)
visualize_test_result("QLambda", performance_test_result)

<img src="./resource/5million_gpi/QLambda_eps_0.300000_test_result.png" />

In [8]:
# Print the expected answers and the values computed from this weights file.
visualize_policy("./resource/5million_gpi/QLambda_0.300000.h5")

- - -
- - -
- - -
answer => [('lower_center', 0), ('middle_center', 0), ('lower_right', 0), ('upper_left', 0), ('upper_center', 0), ('lower_left', 0), ('middle_left', 0), ('middle_right', 0), ('upper_right', 0)]
policy => [('lower_center', 0.22246670722961426), ('lower_right', 0.22246667742729187), ('middle_center', 0.22246666252613068), ('upper_center', 0.22246666252613068), ('upper_left', 0.22246666252613068), ('upper_right', 0.22246666252613068), ('middle_left', 0.22246661782264709), ('middle_right', 0.2224666029214859), ('lower_left', 0.22246649861335754)]

O O -
- - -
- X X
answer => [('lower_left', 1), ('upper_right', 1), ('middle_center', -1), ('middle_left', -1), ('middle_right', -1)]
policy => [('upper_right', 0.97080790996551514), ('middle_left', 0.494884192943573), ('lower_left', 0.37983113527297974), ('middle_right', 0.37916263937950134), ('middle_center', 0.24135981500148773)]

- - -
- O X
- - -
answer => [('upper_center', 1), ('lower_center', 1), ('lower_right', 1), ('upp