In [1]:
# imports

import asyncio
import os
import re
from datetime import date

import json
import matplotlib
import neptune.new as neptune
import nest_asyncio
import numpy as np
import pandas as pd
import time

from collections import defaultdict
from itertools import product
from matplotlib import pyplot
from poke_env.environment.abstract_battle import AbstractBattle
from poke_env.player.battle_order import ForfeitBattleOrder
from poke_env.player.player import Player
# from poke_env.player.random_player import RandomPlayer
from scipy.interpolate import griddata
from src.playerMC import Player as PlayerSarsa

In [2]:
# global configs

debug = True
save_to_json_file = True
use_validation = False
use_neptune = False

nest_asyncio.apply()
np.random.seed(0)

use_neptune = True
if use_neptune:
    run = neptune.init(project='leolellisr/rl-pokeenv',
                       api_token='eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiI1NjY1YmJkZi1hYmM5LTQ3M2QtOGU1ZC1iZTFlNWY4NjE1NDQifQ==',
                       name= 'SarsaDeterministic', tags=['Bruno', 'Sarsa', 'Deterministic', 'Train'])

https://app.neptune.ai/leolellisr/rl-pokeenv/e/RLPOK-123
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [3]:
# our team

OUR_TEAM = """
Turtonator @ White Herb  
Ability: Shell Armor  
EVs: 4 Atk / 252 SpA / 252 Spe  
Rash Nature  
- Flamethrower  
- Dragon Pulse  
- Earthquake  
- Shell Smash  

Lapras @ Leftovers  
Ability: Shell Armor  
EVs: 252 HP / 252 SpA / 4 SpD  
Modest Nature  
IVs: 0 Atk  
- Freeze-Dry  
- Surf  
- Thunderbolt  
- Toxic  

Armaldo @ Assault Vest  
Ability: Battle Armor  
EVs: 252 HP / 252 Atk / 4 SpD  
Adamant Nature  
- Earthquake  
- Knock Off  
- X-Scissor  
- Aqua Jet  

Drapion @ Life Orb  
Ability: Battle Armor  
EVs: 252 Atk / 4 SpD / 252 Spe  
Jolly Nature  
- Poison Jab  
- Knock Off  
- Earthquake  
- X-Scissor  

Kabutops @ Aguav Berry  
Ability: Battle Armor  
EVs: 252 Atk / 4 SpD / 252 Spe  
Jolly Nature  
- Liquidation  
- Leech Life  
- Knock Off  
- Swords Dance  

Falinks @ Iapapa Berry  
Ability: Battle Armor  
EVs: 252 HP / 252 Atk / 4 SpD  
Adamant Nature  
- Close Combat  
- Poison Jab  
- Iron Head  
- No Retreat  

"""


In [4]:
# opponent's team

OP_TEAM = """
Cloyster @ Assault Vest  
Ability: Shell Armor  
EVs: 248 HP / 252 Atk / 8 SpA  
Naughty Nature  
- Icicle Spear  
- Surf  
- Tri Attack  
- Poison Jab  

Omastar @ White Herb  
Ability: Shell Armor  
EVs: 252 SpA / 4 SpD / 252 Spe  
Modest Nature  
IVs: 0 Atk  
- Surf  
- Ancient Power  
- Earth Power  
- Shell Smash  

Crustle @ Leftovers  
Ability: Shell Armor  
EVs: 252 HP / 252 Atk / 4 SpD  
Adamant Nature  
- Earthquake  
- Knock Off  
- X-Scissor  
- Stealth Rock  

Escavalier @ Life Orb  
Ability: Shell Armor  
EVs: 248 HP / 252 Atk / 8 SpD  
Adamant Nature  
- Knock Off  
- Swords Dance  
- Iron Head  
- Poison Jab  

Drednaw @ Aguav Berry  
Ability: Shell Armor  
EVs: 248 HP / 252 Atk / 8 SpD  
Adamant Nature  
- Liquidation  
- Earthquake  
- Poison Jab  
- Swords Dance  

Type: Null @ Eviolite  
Ability: Battle Armor  
EVs: 252 HP / 252 Atk / 4 SpD  
Adamant Nature  
- Facade  
- Sleep Talk  
- Shadow Claw  
- Rest  

"""

In [5]:
N_OUR_MOVE_ACTIONS = 4
N_OUR_SWITCH_ACTIONS = 5
N_OUR_ACTIONS = N_OUR_MOVE_ACTIONS + N_OUR_SWITCH_ACTIONS

ALL_OUR_ACTIONS = np.array(range(0, N_OUR_ACTIONS))

NAME_TO_ID_DICT = {
    "turtonator": 0,
    "lapras": 1,
    "armaldo": 2,
    "drapion": 3,
    "kabutops": 4,
    "falinks": 5,
    "cloyster": 6,
    "omastar": 7,
    "crustle": 8,
    "escavalier": 9,
    "drednaw": 10,
    "typenull": 11
}

In [6]:
# Max-damage player

class MaxDamagePlayer(Player):
    def choose_move(self, battle):
        if battle.available_moves:
            best_move = max(battle.available_moves, key=lambda move: move.base_power)
            return self.create_order(best_move)
        else:
            return self.choose_random_move(battle)

In [7]:
# SARSA player
class SARSAPlayer(PlayerSarsa):
    def __init__(self, battle_format, team, n0, gamma, lambda_):
        super().__init__(battle_format=battle_format, team=team)
        self.N = defaultdict(lambda: np.zeros(N_OUR_ACTIONS))
        self.Q = defaultdict(lambda: np.zeros(N_OUR_ACTIONS))
        self.E = defaultdict(lambda: np.zeros(N_OUR_ACTIONS))
        self.n0 = n0
        self.gamma = gamma
        self.state = None
        self.action = None
        self.lambda_ = lambda_

    def choose_move(self, battle):
        
        if self.state is not None:
            # observe R, next_state and next_action
            reward = self.compute_reward(battle)
            next_state = self.embed_battle(battle)
            next_action = self.choose_action(next_state)
            
            #alpha
            self.N[self.state][self.action] += 1
            alpha = 1.0 / self.N[self.state][self.action]
            
            ########################  Calculate Lambda SARSA
            delta = reward + self.gamma*self.Q[next_state][next_action] - self.Q[self.state][self.action]
    
            self.E[self.state][self.action] += 1
    
            for s, _ in self.Q.items():
                    self.Q[s][:] += alpha * delta * self.E[s][:]
                    self.E[s][:] *= self.lambda_ * self.gamma
            
            # S <- S'  A <- A' 
            self.state = next_state
            self.action = next_action
            # Update the policy
            self.policy = self.update_epsilon_greedy_policy(self.Q, self.n0, self.N)
            
        else:
            # S first initialization
            self.state = self.embed_battle(battle)
            # Choose action
            self.action = self.choose_action(self.state)

        # if the selected action is not possible, perform a random move instead
        if self.action == -1:
            return ForfeitBattleOrder()
        elif self.action < 4 and self.action < len(battle.available_moves) and not battle.force_switch:
            return self.create_order(battle.available_moves[self.action])
        elif 0 <= self.action - 4 < len(battle.available_switches):
            return self.create_order(battle.available_switches[self.action - 4])
        else:
            return self.choose_random_move(battle)

    def _battle_finished_callback(self, battle):
        self.E = defaultdict(lambda: np.zeros(N_OUR_ACTIONS))
        if use_neptune:
            run[f'N0: {self.n0} gamma: {self.gamma} win_acc'].log(self.n_won_battles / len(self._reward_buffer))

    
    ''' Helper functions '''
    #Function to choose the next action
    def choose_action(self, state):
        action_probs = self.policy(state)
        action = np.random.choice(ALL_OUR_ACTIONS, p=action_probs)
        #Return the action
        return int(action)


    # epsilon-greedy policy
    def pi(self, state):
        epsilon = self.n0 / (self.n0 + np.sum(self.N[state]))
        # let's get the greedy action. Ties must be broken arbitrarily
        greedy_action = np.random.choice(np.where(self.Q[state] == self.Q[state].max())[0])
        action_pick_probability = np.full(N_OUR_ACTIONS, epsilon / N_OUR_ACTIONS)
        action_pick_probability[greedy_action] += 1 - epsilon
        return np.random.choice(ALL_OUR_ACTIONS, p=action_pick_probability)

    # the embed battle is our state
    # 12 factors: our active mon, opponent's active mon, 4 moves base power, 4 moves multipliers, num fainted mons
    @staticmethod
    def embed_battle(battle):
        # -1 indicates that the move does not have a base power
        # or is not available
        moves_base_power = -np.ones(4)
        moves_dmg_multiplier = np.ones(4)
        for i, move in enumerate(battle.available_moves):
            moves_base_power[i] = (
                    move.base_power / 100
            )  # Simple rescaling to facilitate learning
            if move.type:
                moves_dmg_multiplier[i] = move.type.damage_multiplier(
                    battle.opponent_active_pokemon.type_1,
                    battle.opponent_active_pokemon.type_2,
                )

        # We count how many pokemons have not fainted in each team
        n_fainted_mon_team = (
            len([mon for mon in battle.team.values() if mon.fainted])
        )
        n_fainted_mon_opponent = (
            len([mon for mon in battle.opponent_team.values() if mon.fainted])
        )

        state = list()
        state.append(NAME_TO_ID_DICT[str(battle.active_pokemon).split(' ')[0]])
        state.append(NAME_TO_ID_DICT[str(battle.opponent_active_pokemon).split(' ')[0]])
        for move_base_power in moves_base_power:
            state.append('{0:.2f}'.format(move_base_power))
        for move_dmg_multiplier in moves_dmg_multiplier:
            state.append('{0:.2f}'.format(move_dmg_multiplier))
        state.append(n_fainted_mon_team)
        state.append(n_fainted_mon_opponent)

        return str(state)

    # Computing rewards
    def reward_computing_helper(
            self,
            battle: AbstractBattle,
            *,
            fainted_value: float = 0.15,
            hp_value: float = 0.15,
            number_of_pokemons: int = 6,
            starting_value: float = 0.0,
            status_value: float = 0.15,
            victory_value: float = 1.0
    ) -> float:
        # 1st compute
        if battle not in self._reward_buffer:
            self._reward_buffer[battle] = starting_value
        current_value = 0

        # Verify if pokemon have fainted or have status
        for mon in battle.team.values():
            current_value += mon.current_hp_fraction * hp_value
            if mon.fainted:
                current_value -= fainted_value
            elif mon.status is not None:
                current_value -= status_value

        current_value += (number_of_pokemons - len(battle.team)) * hp_value

        # Verify if opponent pokemon have fainted or have status
        for mon in battle.opponent_team.values():
            current_value -= mon.current_hp_fraction * hp_value
            if mon.fainted:
                current_value += fainted_value
            elif mon.status is not None:
                current_value += status_value

        current_value -= (number_of_pokemons - len(battle.opponent_team)) * hp_value

        # Verify if we won or lost
        if battle.won:
            current_value += victory_value
        elif battle.lost:
            current_value -= victory_value

        # Value to return
        to_return = current_value - self._reward_buffer[battle]
        self._reward_buffer[battle] = current_value
        if use_neptune:
            run[f'N0: {self.n0}, gamma: {self.gamma} reward_buffer'].log(current_value)
        return to_return

    # Calling reward_computing_helper
    def compute_reward(self, battle) -> float:
        return self.reward_computing_helper(battle, fainted_value=2, hp_value=1, victory_value=15)


In [8]:
# Q-learning validation player
class SARSAValidationPlayer(PlayerSarsa):
    def __init__(self, battle_format, team, Q):
        super().__init__(battle_format=battle_format, team=team)
        self.Q = Q

    def choose_move(self, battle):
        state = self.embed_battle(battle)
        # let's get the greedy action. Ties must be broken arbitrarily
        if state in self.Q.keys():
            action = np.random.choice(np.where(self.Q[state] == self.Q[state].max())[0])
        else:
            return self.choose_random_move(battle)

        # if the selected action is not possible, perform a random move instead
        if action == -1:
            return ForfeitBattleOrder()
        elif action < 4 and action < len(battle.available_moves) and not battle.force_switch:
            return self.create_order(battle.available_moves[action])
        elif 0 <= action - 4 < len(battle.available_switches):
            return self.create_order(battle.available_switches[action - 4])
        else:
            return self.choose_random_move(battle)

    def _battle_finished_callback(self, battle):
        pass

    # the embed battle is our state
    # 12 factors: our active mon, opponent's active mon, 4 moves base power, 4 moves multipliers, remaining mons
    @staticmethod
    def embed_battle(battle):
        # -1 indicates that the move does not have a base power
        # or is not available
        moves_base_power = -np.ones(4)
        moves_dmg_multiplier = np.ones(4)
        for i, move in enumerate(battle.available_moves):
            moves_base_power[i] = (
                    move.base_power / 100
            )  # Simple rescaling to facilitate learning
            if move.type:
                moves_dmg_multiplier[i] = move.type.damage_multiplier(
                    battle.opponent_active_pokemon.type_1,
                    battle.opponent_active_pokemon.type_2,
                )

        # We count how many pokemons have not fainted in each team
        remaining_mon_team = (
            len([mon for mon in battle.team.values() if mon.fainted])
        )
        remaining_mon_opponent = (
            len([mon for mon in battle.opponent_team.values() if mon.fainted])
        )

        state = list()
        state.append(NAME_TO_ID_DICT[str(battle.active_pokemon).split(' ')[0]])
        state.append(NAME_TO_ID_DICT[str(battle.opponent_active_pokemon).split(' ')[0]])
        for move_base_power in moves_base_power:
            state.append('{0:.2f}'.format(move_base_power))
        for move_dmg_multiplier in moves_dmg_multiplier:
            state.append('{0:.2f}'.format(move_dmg_multiplier))
        state.append(remaining_mon_team)
        state.append(remaining_mon_opponent)

        return str(state)


In [9]:
# global parameters

# possible values for num_battles (number of episodes)
n_battles_array = [10000]
# exploration schedule from MC, i. e., epsilon(t) = N0 / (N0 + N(S(t)))
n0_array = [0.0001, 0.001, 0.01]
# possible values for gamma (discount factor)
gamma_array = [0.75] 
#Lambda
lambda_ = [0, 0.2, 0.4, 0.6, 0.8, 1]

list_of_params = [
    {
        'n_battles': n_battles,
        'n0': n0,
        'gamma': gamma,
        'lambda': lambda_
    } for n_battles, n0, gamma, lambda_ in product(n_battles_array, n0_array, gamma_array, lambda_)
]

In [10]:
# json helper functions

def save_dict_to_json(path_dir, filename, data, append=True):
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    full_filename = path_dir + "/" + filename
    if os.path.exists(full_filename) and append:
        with open(full_filename, "r") as file:
            value_dict = json.load(file)
            for key in data:
                value_dict[key] = data[key] if isinstance(data[key], list) else data[key].tolist()
            file.close()
    else:
        value_dict = dict()
        for key in data:
            value_dict[key] = data[key] if isinstance(data[key], list) else data[key].tolist()
    # write
    with open(full_filename, "w") as file:
        json.dump(value_dict, file)
        file.close()


def read_dict_from_json(path_dir, filename):
    full_filename = path_dir + "/" + filename
    if not os.path.exists(full_filename):
        return dict()
    file = open(full_filename, "r")
    data = json.load(file)
    file.close()
    return data

In [11]:
# let's battle!
async def lets_battle():
    for params in list_of_params:
        start = time.time()
        if use_neptune:
            run['params'] = params
        params['player'] = SARSAPlayer(battle_format="gen8ou", team=OUR_TEAM, n0=params['n0'], gamma=params['gamma'], lambda_ = params['lambda'])
        params['opponent'] = MaxDamagePlayer(battle_format="gen8ou", team=OP_TEAM)
        await params['player'].battle_against(opponent=params['opponent'], n_battles=params['n_battles'])
        if debug:
            print("training: num battles (episodes)=%d, N0=%f, gamma=%f, lambda=%f, wins=%d, winning %%=%f, total time=%s sec" %
                  (
                      params['n_battles'],
                      round(params['n0'], 2),
                      round(params['gamma'], 2),
                      round(params['lambda'], 2),
                      params['player'].n_won_battles,
                      round((params['player'].n_won_battles / params['n_battles']) * 100, 2),
                      round(time.time() - start, 2)
                  ))

        # save Q to json file
        if save_to_json_file:
            today_s = str(date.today())
            n_battle_s = str(params['n_battles'])
            n0_s = str(round(params['n0'], 8))
            gamma_s = str(round(params['gamma'], 8))
            lambda_s = str(round(params['lambda'], 8))
            winning_percentage_s = str(round((params['player'].n_won_battles / params['n_battles']) * 100, 2))
            filename = "SarsaLambda_" + today_s + "_" + n_battle_s + "_" + n0_s + "_" + gamma_s + "_" + lambda_s + "_" + winning_percentage_s + ".json"
            save_dict_to_json("./SarsaLambda10k_Table", filename, params['player'].Q, False)

        # statistics: key: "n_battles, n0, gamma", values: list of win or lose
        key = str(params['n_battles']) + "_" + str(round(params['n0'], 2)) + "_" + str(round(params['gamma'], 2)) + "_" + str(round(params['lambda'], 2))
        winning_status = list()
        for battle in params['player']._battles.values():
            if battle.won:
                winning_status.append(True)
            else:
                winning_status.append(False)
        # save statistics json file (append)
        data = dict()
        data[key] = winning_status
        save_dict_to_json("./statistics", "statistics.json", data)


In [12]:
loop = asyncio.get_event_loop()
loop.run_until_complete(loop.create_task(lets_battle()))

training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.000000, wins=4750, winning %=47.500000, total time=3422.37 sec
training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.200000, wins=4513, winning %=45.130000, total time=3468.67 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.400000, wins=3410, winning %=34.100000, total time=3686.08 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.600000, wins=2538, winning %=25.380000, total time=3868.77 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.800000, wins=996, winning %=9.960000, total time=3610.57 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=1.000000, wins=372, winning %=3.720000, total time=3275.5 sec


Error occurred during asynchronous operation processing: Timestamp must be non-decreasing for series attribute: N0: 0.001 gamma: 0.75 win_acc. Invalid point: 2021-11-06T02:34:26.864Z
Error occurred during asynchronous operation processing: Timestamp must be non-decreasing for series attribute: N0: 0.001 gamma: 0.75 win_acc. Invalid point: 2021-11-06T02:34:27.249Z
Error occurred during asynchronous operation processing: Timestamp must be non-decreasing for series attribute: N0: 0.001 gamma: 0.75 win_acc. Invalid point: 2021-11-06T02:34:27.472Z
Error occurred during asynchronous operation processing: Timestamp must be non-decreasing for series attribute: N0: 0.001 gamma: 0.75 win_acc. Invalid point: 2021-11-06T02:34:27.795Z
Error occurred during asynchronous operation processing: Timestamp must be non-decreasing for series attribute: N0: 0.001 gamma: 0.75 win_acc. Invalid point: 2021-11-06T02:34:28.173Z
Error occurred during asynchronous operation processing: Timestamp must be non-decrea

training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.000000, wins=4588, winning %=45.880000, total time=3313.15 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.200000, wins=4805, winning %=48.050000, total time=3302.13 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.400000, wins=3624, winning %=36.240000, total time=3590.78 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.600000, wins=2461, winning %=24.610000, total time=3613.5 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=0.800000, wins=884, winning %=8.840000, total time=3615.79 sec




training: num battles (episodes)=10000, N0=0.000000, gamma=0.750000, lambda=1.000000, wins=456, winning %=4.560000, total time=3358.37 sec




training: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.000000, wins=4557, winning %=45.570000, total time=3330.85 sec




training: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.200000, wins=4442, winning %=44.420000, total time=3341.0 sec




training: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.400000, wins=3000, winning %=30.000000, total time=3621.49 sec




training: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.600000, wins=2747, winning %=27.470000, total time=3752.52 sec




training: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.800000, wins=1146, winning %=11.460000, total time=3554.63 sec




training: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=1.000000, wins=495, winning %=4.950000, total time=3411.39 sec


In [13]:
if use_neptune: run.stop()

Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 308 operations to synchronize with Neptune. Do not kill this process.


All 308 operations synced, thanks for waiting!


In [14]:
# plotting helper functions

def plot_2d(path, title, x_label, x_array, y_label, y_array):
    # set labels and plot surface
    figure = matplotlib.pyplot.figure(figsize=(20, 10))
    ax = figure.gca()
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.plot(x_array, y_array)
    # pyplot.show()
    if not os.path.exists(path):
        os.makedirs(path)
    filename = path + "/" + title + "_" + x_label + "_" + y_label + "_" + ".pdf"
    figure.savefig(filename, dpi=figure.dpi)
    pyplot.close(figure)

def plot_3d(path, title, x_label, x_array, y_label, y_array, z_label, z_array):
    xyz = {'x': x_array, 'y': y_array, 'z': z_array}
    df = pd.DataFrame(xyz, index=range(len(xyz['x'])))
    xv, yv = np.meshgrid(x_array, y_array)
    zv = griddata((df['x'], df['y']), df['z'], (xv, yv), method='nearest')
    # set labels and plot surface
    figure = matplotlib.pyplot.figure(figsize=(20, 10))
    ax = figure.gca(projection='3d')
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_zlabel(z_label)
    surface = ax.plot_surface(xv, yv, zv, rstride=1, cstride=1, cmap=matplotlib.cm.coolwarm, linewidth=0,
                              antialiased=False)
    figure.colorbar(surface)
    # pyplot.show()
    if not os.path.exists(path):
        os.makedirs(path)
    filename = path + "/" + title + ".pdf"
    figure.savefig(filename, dpi=figure.dpi)
    pyplot.close(figure)


In [15]:
# plot value from state-action pair

def plot_v_from_state_action_json(path_dir, x_label, x_func, y_label, y_func):
    # open files
    for filename in os.listdir(path_dir):
        q = dict()
        q_json = read_dict_from_json(path_dir, filename)
        for key in q_json.keys():
            q[key] = np.array(q_json[key])
        x_values = []
        y_values = []
        z_values = []
        # for state, actions
        for state, actions in q.items():
            state = re.sub(r"[,!?><:'\[\]()@*~#]", "", state)
            key_float = [float(k) for k in state.split()]
            x_emb = x_func(key_float)
            x_values.append(x_emb)
            y_emb = y_func(key_float)
            y_values.append(y_emb)
            action_value = np.max(actions)
            z_values.append(action_value)
        # plot 3D
        title = "v_from_" + filename
        plot_3d("./plot", title, x_label, x_values, y_label, y_values, "V", z_values)


# plots from Q
plot_v_from_state_action_json(path_dir="./SarsaLambda10k_Table",
                              x_label="20 * index_pokemon + sum(moves_base_power * moves_dmg_multiplier)",
                              x_func=lambda k: 20 * k[0] + k[1] * k[5] + k[2] * k[6] + k[3] * k[7] + k[4] * k[8],
                              y_label="remaining_mon_team - remaining_mon_opponent",
                              y_func=lambda k: k[8] - k[9])

MemoryError: 

MemoryError: Unable to allocate 564. MiB for an array with shape (4, 18490000) and data type float64

<Figure size 1440x720 with 2 Axes>

In [16]:
# plot additional statistics

def plot_statistics_json(path_dir, filename="statistics.json"):
    statistics = read_dict_from_json(path_dir, filename)
    # win/lost vs. episode number
    for key in statistics.keys():
        key_elements = key.split("_")
        n_battles = key_elements[0]
        n0 = key_elements[1]
        gamma = key_elements[2]
        value = statistics[key]
        plot_2d(path="./plot",
                title="victory_n_battles_" + n_battles + "_N0_" + n0 + "_gamma_" + gamma,
                x_label="episodes",
                x_array=np.array(range(0, len(value))),
                y_label="victory",
                y_array=np.array(value).astype(int))

    # winning % by set of parameters
    n_battles = ""
    x_values = []
    y_values = []
    z_values = []
    for key in statistics.keys():
        key_elements = key.split("_")
        n_battles = key_elements[0]
        n0 = key_elements[1]
        gamma = key_elements[2]
        value = statistics[key]
        x_values.append(n0)
        y_values.append(gamma)
        z_values.append(value.count(True) / len(value))
    plot_3d(path="./plot",
            title="winning_percentage_n_battles_" + n_battles,
            x_label="N0",
            x_array=np.array(x_values).astype(np.float),
            y_label="gamma",
            y_array=np.array(y_values).astype(np.float),
            z_label="winning %",
            z_array=np.array(z_values))


# plots from statistics
plot_statistics_json("./statistics")

In [23]:
# validation

async def do_battle_validation():
    for params in list_of_params:
        # validation (play 1/3 of the battles using Q-learned table)
        start = time.time()
        #params['player'] = SARSAPlayer(battle_format="gen8ou", team=OUR_TEAM, n0=params['n0'], gamma=params['gamma'], lambda_ = params['lambda'])
        params['opponent'] = MaxDamagePlayer(battle_format="gen8ou", team=OP_TEAM)
        params['validation_player'] = SARSAValidationPlayer(battle_format="gen8ou", team=OUR_TEAM, Q=params['player'].Q)
        n_battles = int(params['n_battles'] / 3)
        await params['validation_player'].battle_against(opponent=params['opponent'], n_battles=n_battles)
        print("validation: num battles (episodes)=%d, N0=%f, gamma=%f, lambda=%f, wins=%d, winning %%=%f, total time=%s sec" %
                  (
                      n_battles,
                      round(params['n0'], 5),
                      round(params['gamma'], 2),
                      round(params['lambda'], 2),
                      params['validation_player'].n_won_battles,
                      round((params['validation_player'].n_won_battles / n_battles) * 100, 2),
                      round(time.time() - start, 2)
                  ))

loop = asyncio.get_event_loop()
loop.run_until_complete(loop.create_task(do_battle_validation()))



validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.800000, wins=52, winning %=0.520000, total time=1626.19 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.000000, wins=52, winning %=0.520000, total time=1825.28 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.000000, wins=52, winning %=2.600000, total time=1836.79 sec




validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=1.000000, wins=48, winning %=0.480000, total time=1888.0 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.200000, wins=47, winning %=0.470000, total time=1877.51 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.200000, wins=50, winning %=2.500000, total time=1873.88 sec




validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.000000, wins=52, winning %=0.520000, total time=1868.28 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.400000, wins=48, winning %=0.480000, total time=1865.71 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.400000, wins=50, winning %=2.500000, total time=1871.14 sec




validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.200000, wins=36, winning %=0.360000, total time=1871.86 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.600000, wins=41, winning %=0.410000, total time=1871.9 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.600000, wins=42, winning %=2.100000, total time=1860.93 sec




validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.400000, wins=52, winning %=0.520000, total time=1876.72 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.800000, wins=60, winning %=0.600000, total time=1869.87 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=0.800000, wins=62, winning %=3.100000, total time=1863.15 sec




validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.600000, wins=57, winning %=0.570000, total time=1988.57 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=1.000000, wins=45, winning %=0.450000, total time=1714.38 sec
validation: num battles (episodes)=10000, N0=0.000100, gamma=0.750000, lambda=1.000000, wins=45, winning %=2.250000, total time=1684.52 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.800000, wins=47, winning %=0.470000, total time=225.37 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.000000, wins=47, winning %=2.350000, total time=223.36 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.000000, wins=47, winning %=0.470000, total time=232.46 sec




validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=1.000000, wins=41, winning %=0.410000, total time=237.36 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.200000, wins=50, winning %=0.500000, total time=232.56 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.200000, wins=50, winning %=2.500000, total time=236.9 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.000000, wins=61, winning %=0.610000, total time=231.53 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.400000, wins=44, winning %=0.440000, total time=237.23 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=0.400000, wins=45, winning %=2.250000, total time=241.45 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.200000, wins=50, winning %=0.500000, total time=243.85 sec
validation: num battles (epi



validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.600000, wins=54, winning %=0.540000, total time=1608.64 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=1.000000, wins=40, winning %=0.400000, total time=1599.44 sec
validation: num battles (episodes)=10000, N0=0.001000, gamma=0.750000, lambda=1.000000, wins=44, winning %=2.200000, total time=1602.42 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.800000, wins=39, winning %=0.390000, total time=251.06 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.000000, wins=52, winning %=0.520000, total time=256.69 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=0.000000, wins=55, winning %=2.750000, total time=259.99 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=1.000000, wins=54, winning %=0.540000, total time=239.93 sec
validation: num battles 



validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=1.000000, wins=43, winning %=0.430000, total time=186.91 sec
validation: num battles (episodes)=10000, N0=0.010000, gamma=0.750000, lambda=1.000000, wins=51, winning %=2.550000, total time=183.47 sec


