In [28]:
from scipy.optimize import differential_evolution
from params import StrategyParams, Params
from strategy import LearningStrategy, RandomStrategy
from game import Game

In [29]:
# Shared run configuration read by func: `num_epochs` is the number of
# learner-vs-learner games played per candidate, `explain` is forwarded to
# Game.play during those games.
p = Params(
    num_epochs=50_000,
    explain=False,
)

In [30]:
def func(x, num_eval_games=5000):
    """Objective for ``differential_evolution``: score the "o" learner.

    Two ``LearningStrategy`` players are built from the candidate vector ``x``
    (identical settings except that "o" gets ``start_q`` negated) and play
    ``p.num_epochs`` games against each other. The resulting ``strategy_o`` is
    then evaluated over ``num_eval_games`` games of
    ``Game(2, [RandomStrategy(), strategy_o])``.

    Args:
        x: Sequence of five floats, in order: ``start_q``, ``learning_rate``,
            ``discount_rate``, ``epsilon_decay_rate``, ``minimum_epsilon``.
        num_eval_games: Number of evaluation games against the random
            strategy. Defaults to 5000, the value previously hard-coded.

    Returns:
        ``-(o_win + 0.5 * draw)`` as a float. The score is negated because
        SciPy's ``differential_evolution`` *minimises* its objective; without
        the sign flip the optimiser would search for the weakest "o" player.

    Side effects:
        Prints the "x" ``StrategyParams`` and the evaluation tally.
    """
    # Everything except start_q is shared by both learners.
    shared_settings = dict(
        learning_rate=x[1],
        discount_rate=x[2],
        epsilon_decay_rate=x[3],
        minimum_epsilon=x[4],
        next_state_is_predictable=False,
        predictive=True,
        learning=True,
        keep_log=False,
    )
    sp_learning_x = StrategyParams(start_q=x[0], **shared_settings)
    sp_learning_o = StrategyParams(start_q=-x[0], **shared_settings)

    print(sp_learning_x)

    strategy_x = LearningStrategy(sp_learning_x)
    strategy_o = LearningStrategy(sp_learning_o)
    strategies = [strategy_x, strategy_o]

    # Training phase: the two learners play each other.
    for _ in range(p.num_epochs):
        game = Game(2, strategies)
        game.play(explain=p.explain)

    # Evaluation phase: a random strategy takes the first slot, "o" the second.
    x_win = 0
    o_win = 0
    draw = 0
    strategy_random = RandomStrategy()
    for _ in range(num_eval_games):
        game = Game(2, [strategy_random, strategy_o])
        game.play(narrate=False, explain=False)
        if game.data.winner == "x":
            x_win += 1
        elif game.data.winner == "o":
            o_win += 1
        else:
            draw += 1

    print("x wins: {}, o wins: {}, draws: {}".format(x_win, o_win, draw), "\n")

    # A draw counts as half a win; negate so that minimising maximises the score.
    return -(o_win + 0.5 * draw)

In [31]:
# Search box for the vector `x` passed to func: one (low, high) pair per entry.
bounds = [
    (0, 1),      # x[0]: start_q
    (0, 0.5),    # x[1]: learning_rate
    (0, 1),      # x[2]: discount_rate
    (0.5, 1),    # x[3]: epsilon_decay_rate
    (0, 0.5),    # x[4]: minimum_epsilon
]

In [32]:
# SciPy's differential_evolution searches `bounds` for the minimum of `func`;
# leaving the call as the cell's last expression displays the result object.
differential_evolution(
    func=func,
    bounds=bounds,
)

StrategyParams(start_q=0.49753460130624527, learning_rate=0.34428887946955095, discount_rate=0.15132724725279734, epsilon_decay_rate=0.888637553463036, minimum_epsilon=0.49868388208264564, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1642, o wins: 2693, draws: 665 

StrategyParams(start_q=0.5477322595810676, learning_rate=0.281961118234926, discount_rate=0.19643726432358227, epsilon_decay_rate=0.7348880506801824, minimum_epsilon=0.2791927845236568, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1024, o wins: 2848, draws: 1128 

StrategyParams(start_q=0.4043769613960632, learning_rate=0.20702282135810043, discount_rate=0.9134123978740905, epsilon_decay_rate=0.6623711942012797, minimum_epsilon=0.1342637933442684, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 783, o wins: 3230, draws: 987 

StrategyParams(start_q=0.8693599701904586, learning_rate=0.4001738489338

x wins: 1399, o wins: 2869, draws: 732 

StrategyParams(start_q=0.33268868847109245, learning_rate=0.17789011375328445, discount_rate=0.10320771943647872, epsilon_decay_rate=0.6704283660758376, minimum_epsilon=0.334701460013494, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1257, o wins: 2567, draws: 1176 

StrategyParams(start_q=0.9751869922581681, learning_rate=0.1494385744984799, discount_rate=0.29869264505204335, epsilon_decay_rate=0.6982673421449763, minimum_epsilon=0.18675256797692252, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1076, o wins: 2929, draws: 995 

StrategyParams(start_q=0.023206936770249376, learning_rate=0.2914924673779265, discount_rate=0.7323231565995719, epsilon_decay_rate=0.8132246399800696, minimum_epsilon=0.42693428132682837, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1499, o wins: 2713, draws: 788 

StrategyParams(start_q=0.08

x wins: 302, o wins: 3399, draws: 1299 

StrategyParams(start_q=0.9598772672349617, learning_rate=0.49430440701264083, discount_rate=0.39138818262821096, epsilon_decay_rate=0.8947487542551187, minimum_epsilon=0.18006063777763753, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 907, o wins: 3182, draws: 911 

StrategyParams(start_q=0.30685195579957075, learning_rate=0.42469966580640484, discount_rate=0.7740497178618536, epsilon_decay_rate=0.9748654764131459, minimum_epsilon=0.2230571956291939, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1669, o wins: 2440, draws: 891 

StrategyParams(start_q=0.8985178614991968, learning_rate=0.1924632320509186, discount_rate=0.6016909251047166, epsilon_decay_rate=0.8324984701938886, minimum_epsilon=0.4683337130844479, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1573, o wins: 2561, draws: 866 

StrategyParams(start_q=0.829876

x wins: 1085, o wins: 3187, draws: 728 

StrategyParams(start_q=0.4153323503988474, learning_rate=0.007627052842502979, discount_rate=0.7952051087900546, epsilon_decay_rate=0.630177043464696, minimum_epsilon=0.28235597814859537, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1077, o wins: 2768, draws: 1155 

StrategyParams(start_q=0.22523603657828695, learning_rate=0.06894125850511396, discount_rate=0.9146263073123686, epsilon_decay_rate=0.6838083707853873, minimum_epsilon=0.37200741190008957, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1314, o wins: 2730, draws: 956 

StrategyParams(start_q=0.8524783029347915, learning_rate=0.19868332594205682, discount_rate=0.5270516388729927, epsilon_decay_rate=0.9075767811723763, minimum_epsilon=0.27107062628865297, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1071, o wins: 2931, draws: 998 

StrategyParams(start_q=0.80

x wins: 1571, o wins: 2650, draws: 779 

StrategyParams(start_q=0.8838655880577952, learning_rate=0.403314368339806, discount_rate=0.8163190597889599, epsilon_decay_rate=0.938293839668989, minimum_epsilon=0.4383497876134304, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1626, o wins: 2652, draws: 722 

StrategyParams(start_q=0.7020984142238889, learning_rate=0.44259088760016563, discount_rate=0.9118270707222971, epsilon_decay_rate=0.963714748553014, minimum_epsilon=0.4794676418704467, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1929, o wins: 2338, draws: 733 

StrategyParams(start_q=0.6349831693097837, learning_rate=0.007487454365742718, discount_rate=0.4722959460303763, epsilon_decay_rate=0.7532697725262105, minimum_epsilon=0.10805361252559814, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1004, o wins: 2673, draws: 1323 

StrategyParams(start_q=0.14120747

x wins: 1015, o wins: 3253, draws: 732 

StrategyParams(start_q=0.3952785913700537, learning_rate=0.3252403612451459, discount_rate=0.025725161686732334, epsilon_decay_rate=0.976922392230924, minimum_epsilon=0.39660806242419505, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1563, o wins: 2583, draws: 854 

StrategyParams(start_q=0.45691940381499496, learning_rate=0.4599043943120277, discount_rate=0.5287351089547939, epsilon_decay_rate=0.679795200231361, minimum_epsilon=0.24285548790269343, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 961, o wins: 3035, draws: 1004 

StrategyParams(start_q=0.4622815389126646, learning_rate=0.012704184268410107, discount_rate=0.9065558909443605, epsilon_decay_rate=0.7947613335102335, minimum_epsilon=0.36342231380709455, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 1260, o wins: 2759, draws: 981 

StrategyParams(start_q=0.7651

KeyboardInterrupt: 

# Best Params:
   
StrategyParams(start_q=0.23082752666338058, learning_rate=0.09245073949115773, discount_rate=0.3608335241270741, epsilon_decay_rate=0.7160240998016927, minimum_epsilon=0.026094956680744186, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 479, o wins: 3132, draws: 1389 


StrategyParams(start_q=0.10171475550358278, learning_rate=0.16132763314178372, discount_rate=0.2283086928006433, epsilon_decay_rate=0.7953901964762325, minimum_epsilon=0.09772520369426896, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 492, o wins: 3366, draws: 1142 

StrategyParams(start_q=0.7724682885195481, learning_rate=0.4735487280733173, discount_rate=0.2890937754834353, epsilon_decay_rate=0.6800479925044572, minimum_epsilon=0.057665436642734075, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 464, o wins: 3781, draws: 755 

StrategyParams(start_q=0.37032901152117165, learning_rate=0.063425400230055, discount_rate=0.33726241818394087, epsilon_decay_rate=0.6277552071487283, minimum_epsilon=0.06019959209146125, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 464, o wins: 3184, draws: 1352 

StrategyParams(start_q=0.25651026340684513, learning_rate=0.31960716603717787, discount_rate=0.12519597798041465, epsilon_decay_rate=0.505301670253032, minimum_epsilon=0.06871052039780917, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 302, o wins: 3399, draws: 1299 

StrategyParams(start_q=0.043143145008168404, learning_rate=0.02998652759047299, discount_rate=0.4923771021026557, epsilon_decay_rate=0.6482782362134453, minimum_epsilon=0.07598804308383217, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 309, o wins: 3788, draws: 903 

StrategyParams(start_q=0.611717749204169, learning_rate=0.3866630473229207, discount_rate=0.046976083536071744, epsilon_decay_rate=0.642567554741782, minimum_epsilon=0.010835465079376577, next_state_is_predictable=False, predictive=True, learning=True, keep_log=False)
x wins: 406, o wins: 3646, draws: 948

In [None]:
# Fixed StrategyParams written out by hand rather than taken from the search.
# Unlike the learners built inside func, this one has keep_log=True, and its
# epsilon_decay_rate of 0.0 lies outside the (0.5, 1) range used in `bounds`.
sp_learning_x = StrategyParams(
    start_q=0.3,
    learning_rate=0.1,
    discount_rate=0.7,
    epsilon_decay_rate=0.0,
    minimum_epsilon=0.4,
    next_state_is_predictable=False,
    predictive=True,
    learning=True,
    keep_log=True,
)