In [411]:
import chainer
import chainer.functions as F
import chainer.links as L
import chainerrl
from chainerrl import explorers

In [439]:
import pygame
import sys
from time import sleep
import random
import math
import numpy as np
from gym import spaces

# COLORS
white = (255, 255, 255)
black = (0, 0, 0)
red = (255, 0, 0)
green = (0, 255, 0)
blue = (0, 0, 255)
purple = (255, 0, 255)

# SIZES
AGENT_SIZE = 50
WEAPON_SIZE = 20

class Entity:
    def __init__(self, name, xy, angle, speed, game_dims=(1000,800)):
        self.name = name
        self.x, self.y = xy
        self.speed = speed
        self.angle = math.radians(-angle)  # -1 to 1 
        self.dx = self.speed * math.cos(self.angle)
        self.dy = self.speed * math.sin(self.angle)
        self.dimx, self.dimy = game_dims
        
    def update(self, agent_xy):
        self.x += self.dx
        self.y += self.dy

        agent_x, agent_y = agent_xy

        has_hit_x = self.x >= agent_x - WEAPON_SIZE and self.x <= agent_x + AGENT_SIZE
        has_hit_y = self.y >= agent_y - WEAPON_SIZE and self.y <= agent_y + AGENT_SIZE
        
        exit_boundary = self.x > self.dimx-50 or self.x < 000 or self.y > self.dimy-50 or self.y < 0

        return has_hit_x and has_hit_y or exit_boundary
    
    def __repr__(self):
        return self.name + str((self.x,self.y))


class Agent:
    def __init__(self, xy=(400,100), game_dims=(1000,800), show=False):
        self.jumps = 0
        self.maxJumps = 2
        self.xpos, self.ypos = xy
        self.touchingObst = 0
        self.gravityPull = 0.5
        self.gravityCurrent = 0
        self.xCurrent = 0
        self.show = show
        self.dimx, self.dimy = game_dims
    def jump(self):
        if self.jumps < self.maxJumps:
            self.gravityCurrent = -10
            self.jumps = self.jumps + 1
    def left(self):
        if self.touchingObst == 0:
            self.xCurrent = -10
    def right(self):
        if self.touchingObst == 0:
            self.xCurrent = 10
    def update(self):
        # CONTROL GRAVITY
        self.gravityCurrent = self.gravityCurrent + self.gravityPull

        # RATE OF DECREASE OF LEFT/RIGHT MOVEMENTS
        if self.xCurrent > 0:
            self.xCurrent = self.xCurrent - 0.5
        if self.xCurrent < 0:
            self.xCurrent = self.xCurrent + 0.5

        # UPDATE XY COORDINATES
        self.ypos = self.ypos + self.gravityCurrent
        self.xpos = self.xpos + self.xCurrent

        # BOUNDARIES
        if self.xpos > self.dimx-50:
            self.xpos = self.dimx-50
        if self.xpos < 000:
            self.xpos = 000
        if self.ypos > self.dimy-50:
            self.ypos = self.dimy-50+1
            self.gravityCurrent = 0
            self.jumps = 0
            
    def display(self, gameDisplay):
        if self.show:
            pygame.draw.rect(gameDisplay, red, (self.xpos, self.ypos, AGENT_SIZE, AGENT_SIZE))
    def act(self, agent_action):
        # print(agent_action)
        if agent_action == 0:
            self.left()
        elif agent_action == 1:
            self.right()
        elif agent_action == 2:
            self.jump()
        self.update()
            

class Env:
    def __init__(self, 
                 game_dims=(1000, 800),
                 show=False):
        self.dimx, self.dimy = game_dims
        self.agent = Agent((400,100), show=show, game_dims=game_dims)
        self.set_default_rewards()
        self.observation_space = 5
        self.show = show
        pygame.init()
        self.play = True
        
        # GAME DIMENSIONS
        self.game_dims = game_dims
        self.generator_action_space = spaces.Box(np.array([0,0,0,0]), np.array([2,self.dimx,self.dimy,360]), dtype=np.float32)
        self.agent_action_space = spaces.Discrete(3)
        min_obs = np.array([0]*5 + [0,0,5,0]*5)
        max_obs = np.array([2,self.dimx,self.dimy,1,self.dimy*2] + [self.dimx,self.dimy,50,360]*5)
        self.observation_space = spaces.Box(min_obs, max_obs, dtype=np.float32)
        
        # DELAY FOR WEAPON ENTITIES
        self.delay = 0
#         self.entity_list = []
        
        self.entity_limit = 5
        self.entity_free_keys = [0,1,2,3,4]
        self.entity_dict = {}
    def get_free_key(self):
        if len(self.entity_free_keys) > 0:
            return self.entity_free_keys.pop(0)
        else:
            return None
    
    # CONTROL MOVEMENTS
    def execute(self):
        agent_action = None
        weapon_action = (0,0,0)
        
        events = pygame.event.get()
        for event in events:
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_SPACE:
                    agent_action = 2
                if event.key == pygame.K_LEFT and self.agent.touchingObst == 0:
                    agent_action = 0
                if event.key == pygame.K_RIGHT and self.agent.touchingObst == 0:
                    agent_action = 1
            if event.type == pygame.QUIT:
                pygame.display.quit()
                agent_action = -1
                
        action = (agent_action, weapon_action)
        return action
            
    def test_agent(self):
        """FOR TESTING OF AGENT ACTIONS ONLY"""
        run = True
        while run:
            sleep(0.01)
            action = self.execute()
            if action == -1:
                break
            self.step(action)
            
    def create_entity(self, weapon_action):
        # print(weapon_action)
        wep_type, wep_x, wep_y, angle = weapon_action
        wep_xy = (wep_x, wep_y)
#         print(wep_xy)
        if self.delay != 0:
            self.delay -= 1
        wep_type = 1
        if wep_type == 1 and self.delay == 0:
            ent = Entity(str(wep_type), wep_xy, angle, 10)
            ent_key = self.get_free_key()
            if ent_key != None:
                self.entity_dict[ent_key] = ent
            self.delay = 20  # DELAY BEFORE THE NEXT ATTACK
    def update_entities(self):
        # UPDATE ENTITIES
        collided = []
        for key, ent in self.entity_dict.items():
            collide = ent.update((self.agent.xpos, self.agent.ypos))
            if not collide:
                if self.show:
#                     print(ent.x, ent.y)
                    pygame.draw.rect(self.gameDisplay, blue, (ent.x, ent.y, WEAPON_SIZE, WEAPON_SIZE))
            else:
                self.agent.agent_reward = -20
                self.generator_reward = 20
                collided.append(key)
        
        for ent_key in collided:
            self.entity_free_keys.append(ent_key)
            del self.entity_dict[ent_key]
    def display_game(self):
        if self.show:
#             print(self.game_dims)
            self.gameDisplay = pygame.display.set_mode(self.game_dims, 0, 32)
            self.gameDisplay.fill(white)
    def display_background(self):
        # DISPLAY BACKGROUND
        if self.show:
            pygame.font.init()
            myFont = pygame.font.SysFont('Futura PT Light', 60)
            textsurface = myFont.render('The Chosen One', False, black)
            self.gameDisplay.blit(textsurface, (200,200))
            pygame.display.update()
    def set_default_rewards(self):
        self.agent.agent_reward = 1
        self.generator_reward = -1

    def step(self, action):
        # SET DEFAULT REWARDS FOR AGENT AND GENERATOR
        self.set_default_rewards()
        
        # DISPLAY GAME
        self.display_game()
        
        agent_action, weapon_action = action
        
        # MOVE THE AGENT
        self.agent.act(agent_action)
        if self.show:
            self.agent.display(self.gameDisplay)
        
        # CREATE WEAPON ENTITY
        self.create_entity(weapon_action)
        
        # UPDATE ENTITIES
        self.update_entities()
        
        # DISPLAY BACKGROUND
        self.display_background()
        
        """RETURNS:
        reward - (agent_reward, generator_reward)
        state - getGameState()
        done - CURRENT: DEFAULT: False
        done - TODO: whether game is completed, e.g. HP <= 0
        """
        reward = (self.agent.agent_reward, self.generator_reward)
        state = self.getGameState()
        
        return (reward, state, False)
        
    def getGameState(self):
        a = self.agent
        agent_values = np.array([
            a.jumps,
            a.xpos//1000,
            a.ypos//1000,
            a.touchingObst,
            a.gravityCurrent,
            # TODO: height, width, dy, dx, direction, bounding box
        ])
        entity_values = np.array([])
        for i in range(5):
            if i in self.entity_dict:
                e = self.entity_dict[i]
                vals = [e.x//1000, e.y//1000, e.speed, e.angle]
            else:
                vals = [0,0,0,0]
            entity_values = np.append(entity_values, vals)

        values = np.append(agent_values, entity_values)
        return values
    
    def reset(self):
        """Resets the game. Returns (reward, state, done)."""
        self.__init__(game_dims=self.game_dims, show=self.show)
        return self.getGameState()

    def test_step(self):
        # Create Gun at random place and angles
        agent_action = random.randint(0,2)

        wep_type = 1  # gun
        wep_xy = (50, 700)  # coordinate appears at
        angle = 0

        generator_action = (wep_type, wep_xy, angle)
        action = (agent_action, generator_action)
        self.step(action)

In [440]:
env = Env(game_dims=(700, 500), show=False)

# SHOW ENVIRONMENT VALUES
print('Observation space:', env.observation_space)
print('Agent action space:', env.agent_action_space)
print('Generator action space:', env.generator_action_space)

state = env.reset()
print("State:", state)

Observation space: Box(25,)
Agent action space: Discrete(3)
Generator action space: Box(4,)
State: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]


In [441]:
# Create Gun at random place and angles
agent_action = random.randint(0,2)

wep_type = 1  # gun
wep_xy = (50, 700)
wep_x, wep_y = wep_xy
angle = 20

generator_action = (wep_type, wep_x, wep_y, angle)
action = (agent_action, generator_action)
reward, state, done = env.step(action)
print("Agent Reward:", reward[0])
print("Generator Reward:", reward[1])
print("State:", state)
print("Done:", done)

Agent Reward: 1
Generator Reward: -1
State: [ 0.          0.          0.          0.          0.5         0.
  0.         10.         -0.34906585  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.        ]
Done: False


In [442]:
# FOR CONTINUOUS ACTION SPACE
obs_space = env.observation_space
obs_size = obs_space.low.size

In [443]:
"""AGENT HYPERPAREMETERS"""
agent_action_space = env.agent_action_space
agent_action_size = agent_action_space.n

# Q FUNCTION AND ADAM OPTIMIZER
agent_q_func = chainerrl.q_functions.FCStateQFunctionWithDiscreteAction(
    obs_size, agent_action_size,
    n_hidden_layers=5, n_hidden_channels=100)

# Use Adam to optimize q_func. eps=1e-2 is for stability.
agent_optimizer = chainer.optimizers.Adam(eps=1e-2)
agent_optimizer.setup(agent_q_func)

# Set the discount factor that discounts future rewards.
agent_gamma = 0.95

# Use epsilon-greedy for exploration
agent_explorer = chainerrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3, random_action_func=env.agent_action_space.sample)

# DQN uses Experience Replay.
# Specify a replay buffer and its capacity.
agent_replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

# Since observations from CartPole-v0 is numpy.float64 while
# Chainer only accepts numpy.float32 by default, specify
# a converter as a feature extractor function phi.
agent_phi = lambda x: x.astype(np.float32, copy=False)

# CHOSEN ONE AGENT
chosen_one = chainerrl.agents.DoubleDQN(
    agent_q_func, agent_optimizer, agent_replay_buffer, agent_gamma, agent_explorer,
    replay_start_size=500, update_interval=1,
    target_update_interval=100, phi=agent_phi)

In [444]:
"""GENERATOR HYPERPARAMETERS"""
generator_action_space = env.generator_action_space
generator_action_size = generator_action_space.low.size

# Q FUNCTION FOR CONTINUOUS VARIABLES
generator_q_func = chainerrl.q_functions.FCQuadraticStateQFunction(
    obs_size, generator_action_size,
    n_hidden_layers=5,
    n_hidden_channels=100,
    action_space=generator_action_space,
)

# Use the Ornstein-Uhlenbeck process for exploration
generator_ou_sigma = (generator_action_space.high - generator_action_space.low) * 0.2
generator_explorer = explorers.AdditiveOU(sigma=generator_ou_sigma)

# Use Adam to optimize q_func. eps=1e-2 is for stability.
generator_optimizer = chainer.optimizers.Adam(eps=1e-2)
generator_optimizer.setup(generator_q_func)

# Set the discount factor that discounts future rewards.
generator_gamma = 0.95

# DQN uses Experience Replay.
# Specify a replay buffer and its capacity.
generator_replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)

# Since observations from CartPole-v0 is numpy.float64 while
# Chainer only accepts numpy.float32 by default, specify
# a converter as a feature extractor function phi.
generator_phi = lambda x: x.astype(np.float32, copy=False)

# GENERATOR AGENT
generator = chainerrl.agents.DoubleDQN(
    generator_q_func, generator_optimizer, generator_replay_buffer, generator_gamma, generator_explorer,
    replay_start_size=500, update_interval=1,
    target_update_interval=100, phi=generator_phi)

In [449]:
# MAIN TRAINING LOOP
n_episodes = 1000
max_episode_len = 500

R_agent_history = []
R_generator_history = []
for i in range(1, n_episodes + 1):
    obs = env.reset()
    agent_reward = 0
    generator_reward = 0
    done = False
    R_agent = 0  # return (sum of rewards)
    R_generator = 0
    t = 0  # time step
    while not done and t < max_episode_len:
        # Uncomment to watch the behaviour
        # env.render()
#         print(obs)
        agent_action = chosen_one.act_and_train(obs, agent_reward)
        generator_action = generator.act_and_train(obs, generator_reward)
        action = (agent_action, generator_action)
        
        (reward, obs, done) = env.step(action)
        agent_reward, generator_reward = reward

        R_agent += agent_reward
        R_generator += generator_reward
        R_agent_history.append(R_agent)
        R_generator_history.append(R_generator)
        t += 1
    if i % 1 == 0:
        print('ep:', i,
              'R_agent:', R_agent,
#               'R_generator:', R_generator,
              'agent stat:', chosen_one.get_statistics())
#               'generator statistics:', generator.get_statistics())
    chosen_one.stop_episode_and_train(obs, agent_reward, done)
    generator.stop_episode_and_train(obs, generator_reward, done)
print('Finished.')

ep: 1 R_agent: 17 agent stat: [('average_q', 8.933350225869424), ('average_loss', 1.0534251732671915), ('n_updates', 2219)]
ep: 2 R_agent: -4 agent stat: [('average_q', 10.506435539273403), ('average_loss', 0.932203403650184), ('n_updates', 2719)]
ep: 3 R_agent: 17 agent stat: [('average_q', 12.344744379204764), ('average_loss', 1.0980929290231225), ('n_updates', 3219)]
ep: 4 R_agent: 17 agent stat: [('average_q', 13.576695332067278), ('average_loss', 1.013638625438399), ('n_updates', 3719)]
ep: 5 R_agent: -4 agent stat: [('average_q', 14.52009680853569), ('average_loss', 0.9688254540884538), ('n_updates', 4219)]
ep: 6 R_agent: -25 agent stat: [('average_q', 15.15822366882791), ('average_loss', 1.0232812214119211), ('n_updates', 4719)]
ep: 7 R_agent: -4 agent stat: [('average_q', 15.958815863795643), ('average_loss', 1.0872832611125127), ('n_updates', 5219)]
ep: 8 R_agent: 38 agent stat: [('average_q', 16.49151832260835), ('average_loss', 0.8900419801770801), ('n_updates', 5719)]
ep: 9

ep: 66 R_agent: -4 agent stat: [('average_q', 18.72663652011142), ('average_loss', 1.0083425684623617), ('n_updates', 34719)]
ep: 67 R_agent: -4 agent stat: [('average_q', 18.505120745213155), ('average_loss', 0.9331795591462033), ('n_updates', 35219)]
ep: 68 R_agent: 59 agent stat: [('average_q', 18.560333647903477), ('average_loss', 0.8831067291894643), ('n_updates', 35719)]
ep: 69 R_agent: 38 agent stat: [('average_q', 18.636665015593653), ('average_loss', 0.9985464954495662), ('n_updates', 36219)]
ep: 70 R_agent: 17 agent stat: [('average_q', 18.662380964825445), ('average_loss', 0.9212857511439155), ('n_updates', 36719)]
ep: 71 R_agent: 17 agent stat: [('average_q', 18.732654218912696), ('average_loss', 0.9476222100051656), ('n_updates', 37219)]
ep: 72 R_agent: -4 agent stat: [('average_q', 18.87695326809437), ('average_loss', 0.9956752825392113), ('n_updates', 37719)]
ep: 73 R_agent: -25 agent stat: [('average_q', 18.958908371935173), ('average_loss', 0.9158203108051454), ('n_upd

ep: 131 R_agent: -4 agent stat: [('average_q', 18.67340076371605), ('average_loss', 0.9278900443852972), ('n_updates', 67219)]
ep: 132 R_agent: -4 agent stat: [('average_q', 18.586607901343907), ('average_loss', 0.8911274551498871), ('n_updates', 67719)]
ep: 133 R_agent: -4 agent stat: [('average_q', 18.59184399179645), ('average_loss', 0.9284872176811455), ('n_updates', 68219)]
ep: 134 R_agent: -4 agent stat: [('average_q', 18.6682021738786), ('average_loss', 0.877195369493544), ('n_updates', 68719)]
ep: 135 R_agent: -4 agent stat: [('average_q', 18.755993919051253), ('average_loss', 0.9682994872398196), ('n_updates', 69219)]
ep: 136 R_agent: 17 agent stat: [('average_q', 18.64856939410791), ('average_loss', 0.8840378921963699), ('n_updates', 69719)]
ep: 137 R_agent: 17 agent stat: [('average_q', 18.641548435910792), ('average_loss', 0.9868365002981226), ('n_updates', 70219)]
ep: 138 R_agent: 17 agent stat: [('average_q', 18.49964365222028), ('average_loss', 0.9279616659980763), ('n_u

ep: 196 R_agent: 17 agent stat: [('average_q', 18.830592782140034), ('average_loss', 0.9566733643311631), ('n_updates', 99719)]
ep: 197 R_agent: 17 agent stat: [('average_q', 18.79807830420804), ('average_loss', 0.9354404402305201), ('n_updates', 100219)]
ep: 198 R_agent: 17 agent stat: [('average_q', 18.721273945954138), ('average_loss', 0.8758224790027741), ('n_updates', 100719)]
ep: 199 R_agent: -25 agent stat: [('average_q', 18.73524997700443), ('average_loss', 1.0059976463554574), ('n_updates', 101219)]
ep: 200 R_agent: 17 agent stat: [('average_q', 18.769201488752774), ('average_loss', 0.8961598974102798), ('n_updates', 101719)]
ep: 201 R_agent: 17 agent stat: [('average_q', 18.709618874303352), ('average_loss', 0.9378841436380979), ('n_updates', 102219)]
ep: 202 R_agent: -25 agent stat: [('average_q', 18.748610062854645), ('average_loss', 0.9698574068990411), ('n_updates', 102719)]
ep: 203 R_agent: 17 agent stat: [('average_q', 18.783473567038197), ('average_loss', 0.94473548108

ep: 260 R_agent: 38 agent stat: [('average_q', 18.728431424828155), ('average_loss', 0.9029930430179026), ('n_updates', 131719)]
ep: 261 R_agent: -4 agent stat: [('average_q', 18.62135305097647), ('average_loss', 1.0542753894646184), ('n_updates', 132219)]
ep: 262 R_agent: 17 agent stat: [('average_q', 18.635875528720888), ('average_loss', 0.9628180337040741), ('n_updates', 132719)]
ep: 263 R_agent: -4 agent stat: [('average_q', 18.763385726972956), ('average_loss', 0.8492803060595525), ('n_updates', 133219)]
ep: 264 R_agent: 17 agent stat: [('average_q', 18.805916800895716), ('average_loss', 0.9867707163249694), ('n_updates', 133719)]
ep: 265 R_agent: 17 agent stat: [('average_q', 18.829120737104514), ('average_loss', 0.9654949308873644), ('n_updates', 134219)]
ep: 266 R_agent: 38 agent stat: [('average_q', 18.702959352526673), ('average_loss', 0.9907327135182284), ('n_updates', 134719)]
ep: 267 R_agent: 17 agent stat: [('average_q', 18.64133650793842), ('average_loss', 0.945949373418

ep: 324 R_agent: 17 agent stat: [('average_q', 18.98852626061879), ('average_loss', 0.9276988056599358), ('n_updates', 163719)]
ep: 325 R_agent: 38 agent stat: [('average_q', 18.94371488728106), ('average_loss', 1.0289558963451164), ('n_updates', 164219)]
ep: 326 R_agent: 38 agent stat: [('average_q', 18.820221902383228), ('average_loss', 0.9546535064339324), ('n_updates', 164719)]
ep: 327 R_agent: 59 agent stat: [('average_q', 18.74106164587576), ('average_loss', 0.8581085834785651), ('n_updates', 165219)]
ep: 328 R_agent: 38 agent stat: [('average_q', 18.797366455066424), ('average_loss', 0.9219449143679682), ('n_updates', 165719)]
ep: 329 R_agent: -4 agent stat: [('average_q', 18.87780732654533), ('average_loss', 1.0311608623369062), ('n_updates', 166219)]
ep: 330 R_agent: -4 agent stat: [('average_q', 18.825584299502115), ('average_loss', 1.0334089674240834), ('n_updates', 166719)]
ep: 331 R_agent: -4 agent stat: [('average_q', 18.75928038256968), ('average_loss', 0.993976843712854

ep: 388 R_agent: -4 agent stat: [('average_q', 18.48563319585834), ('average_loss', 0.9501418548706466), ('n_updates', 195719)]
ep: 389 R_agent: -4 agent stat: [('average_q', 18.498843595509097), ('average_loss', 0.9452344147637417), ('n_updates', 196219)]
ep: 390 R_agent: 17 agent stat: [('average_q', 18.454460473401806), ('average_loss', 0.9501747738452144), ('n_updates', 196719)]
ep: 391 R_agent: 38 agent stat: [('average_q', 18.569157546815294), ('average_loss', 0.8964916252079285), ('n_updates', 197219)]
ep: 392 R_agent: -25 agent stat: [('average_q', 18.677783783887012), ('average_loss', 0.9053584053026933), ('n_updates', 197719)]
ep: 393 R_agent: 17 agent stat: [('average_q', 18.64346619781424), ('average_loss', 1.016594744039245), ('n_updates', 198219)]
ep: 394 R_agent: -4 agent stat: [('average_q', 18.605018218430036), ('average_loss', 0.950364458508891), ('n_updates', 198719)]
ep: 395 R_agent: 17 agent stat: [('average_q', 18.582701477852556), ('average_loss', 0.9574003792312

ep: 452 R_agent: -4 agent stat: [('average_q', 18.933294958294358), ('average_loss', 0.9048603171723639), ('n_updates', 227719)]
ep: 453 R_agent: -4 agent stat: [('average_q', 18.879184661702567), ('average_loss', 0.9071758778732558), ('n_updates', 228219)]
ep: 454 R_agent: 38 agent stat: [('average_q', 18.845397186697976), ('average_loss', 0.9247780287567771), ('n_updates', 228719)]
ep: 455 R_agent: 59 agent stat: [('average_q', 18.65367743716377), ('average_loss', 0.8866181077055936), ('n_updates', 229219)]
ep: 456 R_agent: 38 agent stat: [('average_q', 18.7149662481053), ('average_loss', 0.9697690518836943), ('n_updates', 229719)]
ep: 457 R_agent: 17 agent stat: [('average_q', 18.695667736434153), ('average_loss', 1.0486549257338733), ('n_updates', 230219)]
ep: 458 R_agent: -25 agent stat: [('average_q', 18.673714546161154), ('average_loss', 1.0495677193894868), ('n_updates', 230719)]
ep: 459 R_agent: -4 agent stat: [('average_q', 18.64268509146885), ('average_loss', 0.8961152227635

ep: 516 R_agent: 17 agent stat: [('average_q', 18.89866354577019), ('average_loss', 0.9294135418349496), ('n_updates', 259719)]
ep: 517 R_agent: 38 agent stat: [('average_q', 18.830515110440974), ('average_loss', 0.9415078086710919), ('n_updates', 260219)]
ep: 518 R_agent: 17 agent stat: [('average_q', 18.774923660433593), ('average_loss', 0.9305947598401344), ('n_updates', 260719)]
ep: 519 R_agent: -4 agent stat: [('average_q', 18.678899345889764), ('average_loss', 0.903869187131711), ('n_updates', 261219)]
ep: 520 R_agent: -4 agent stat: [('average_q', 18.54074312857525), ('average_loss', 0.8334016517304623), ('n_updates', 261719)]
ep: 521 R_agent: -4 agent stat: [('average_q', 18.49507405688796), ('average_loss', 0.963290702440834), ('n_updates', 262219)]
ep: 522 R_agent: -4 agent stat: [('average_q', 18.553751244867716), ('average_loss', 0.8943612144472498), ('n_updates', 262719)]
ep: 523 R_agent: 17 agent stat: [('average_q', 18.51818020877637), ('average_loss', 1.0107115719406066

ep: 580 R_agent: 17 agent stat: [('average_q', 18.966439733429375), ('average_loss', 0.8779428855324364), ('n_updates', 291719)]
ep: 581 R_agent: -4 agent stat: [('average_q', 19.049539212171073), ('average_loss', 0.8915007388861251), ('n_updates', 292219)]
ep: 582 R_agent: 17 agent stat: [('average_q', 18.96633314669631), ('average_loss', 0.9597380734374639), ('n_updates', 292719)]
ep: 583 R_agent: 17 agent stat: [('average_q', 18.88420495404372), ('average_loss', 1.0252992927993838), ('n_updates', 293219)]
ep: 584 R_agent: -25 agent stat: [('average_q', 18.929660595756825), ('average_loss', 0.7867808222894878), ('n_updates', 293719)]
ep: 585 R_agent: 38 agent stat: [('average_q', 18.895795734456023), ('average_loss', 0.9686831245297801), ('n_updates', 294219)]
ep: 586 R_agent: 17 agent stat: [('average_q', 18.887833747292028), ('average_loss', 0.9859946548577466), ('n_updates', 294719)]
ep: 587 R_agent: 17 agent stat: [('average_q', 18.743235211953085), ('average_loss', 0.90786467897

ep: 644 R_agent: 17 agent stat: [('average_q', 18.746850338724954), ('average_loss', 0.904499385701141), ('n_updates', 323719)]
ep: 645 R_agent: -4 agent stat: [('average_q', 18.7603731309961), ('average_loss', 0.9332408625333946), ('n_updates', 324219)]
ep: 646 R_agent: 38 agent stat: [('average_q', 18.704343770287657), ('average_loss', 0.8362438988074774), ('n_updates', 324719)]
ep: 647 R_agent: 17 agent stat: [('average_q', 18.721975336075314), ('average_loss', 0.9227917438395307), ('n_updates', 325219)]
ep: 648 R_agent: -4 agent stat: [('average_q', 18.695660195808433), ('average_loss', 0.9255221045986118), ('n_updates', 325719)]
ep: 649 R_agent: 38 agent stat: [('average_q', 18.677812764072833), ('average_loss', 0.9573318541232488), ('n_updates', 326219)]
ep: 650 R_agent: -4 agent stat: [('average_q', 18.662814878252068), ('average_loss', 0.8837254770599127), ('n_updates', 326719)]
ep: 651 R_agent: -25 agent stat: [('average_q', 18.690009599243005), ('average_loss', 0.827131333776

ep: 708 R_agent: 59 agent stat: [('average_q', 18.507901451666143), ('average_loss', 0.9230147026558813), ('n_updates', 355719)]
ep: 709 R_agent: 38 agent stat: [('average_q', 18.60189750287703), ('average_loss', 0.9316116117647458), ('n_updates', 356219)]
ep: 710 R_agent: -4 agent stat: [('average_q', 18.734065616636723), ('average_loss', 1.0216513141593344), ('n_updates', 356719)]
ep: 711 R_agent: -4 agent stat: [('average_q', 18.756642946922657), ('average_loss', 1.029045120725181), ('n_updates', 357219)]
ep: 712 R_agent: -4 agent stat: [('average_q', 18.80147300448688), ('average_loss', 0.9984554452718668), ('n_updates', 357719)]
ep: 713 R_agent: 17 agent stat: [('average_q', 18.720664805165868), ('average_loss', 1.011466582899579), ('n_updates', 358219)]
ep: 714 R_agent: -4 agent stat: [('average_q', 18.681092163432126), ('average_loss', 0.86094738238059), ('n_updates', 358719)]
ep: 715 R_agent: -4 agent stat: [('average_q', 18.694538191438074), ('average_loss', 1.0875549743523214

ep: 772 R_agent: 17 agent stat: [('average_q', 18.70727091585639), ('average_loss', 0.9685842796461769), ('n_updates', 387719)]
ep: 773 R_agent: -4 agent stat: [('average_q', 18.680101186014326), ('average_loss', 0.9100038415980428), ('n_updates', 388219)]
ep: 774 R_agent: 17 agent stat: [('average_q', 18.708475102106746), ('average_loss', 0.9474386982640768), ('n_updates', 388719)]
ep: 775 R_agent: 38 agent stat: [('average_q', 18.718287660120236), ('average_loss', 0.9228202335610537), ('n_updates', 389219)]
ep: 776 R_agent: -25 agent stat: [('average_q', 18.67947927355332), ('average_loss', 0.9109922292328879), ('n_updates', 389719)]
ep: 777 R_agent: -4 agent stat: [('average_q', 18.648798674587518), ('average_loss', 0.9038425161917752), ('n_updates', 390219)]
ep: 778 R_agent: 17 agent stat: [('average_q', 18.643415565855918), ('average_loss', 0.9005371666588149), ('n_updates', 390719)]
ep: 779 R_agent: 17 agent stat: [('average_q', 18.647370435000806), ('average_loss', 0.85277330530

ep: 836 R_agent: 17 agent stat: [('average_q', 18.669240531625032), ('average_loss', 0.9152757812309341), ('n_updates', 419719)]
ep: 837 R_agent: -25 agent stat: [('average_q', 18.74548395670417), ('average_loss', 1.0118843725204685), ('n_updates', 420219)]
ep: 838 R_agent: -4 agent stat: [('average_q', 18.72307562406279), ('average_loss', 0.9400338185983893), ('n_updates', 420719)]
ep: 839 R_agent: 17 agent stat: [('average_q', 18.743218367143406), ('average_loss', 0.8894330501924035), ('n_updates', 421219)]
ep: 840 R_agent: 38 agent stat: [('average_q', 18.69478889723362), ('average_loss', 0.9876459093417733), ('n_updates', 421719)]
ep: 841 R_agent: -4 agent stat: [('average_q', 18.58782836966506), ('average_loss', 1.017587588060496), ('n_updates', 422219)]
ep: 842 R_agent: 17 agent stat: [('average_q', 18.600370225193334), ('average_loss', 0.952999027508078), ('n_updates', 422719)]
ep: 843 R_agent: 38 agent stat: [('average_q', 18.560401013366896), ('average_loss', 0.908771449354169

ep: 900 R_agent: 38 agent stat: [('average_q', 18.688966589769123), ('average_loss', 0.9163940038176889), ('n_updates', 451719)]
ep: 901 R_agent: 38 agent stat: [('average_q', 18.7341923087462), ('average_loss', 0.9604204383225959), ('n_updates', 452219)]
ep: 902 R_agent: -25 agent stat: [('average_q', 18.707955911019734), ('average_loss', 1.0044077118665087), ('n_updates', 452719)]
ep: 903 R_agent: -4 agent stat: [('average_q', 18.694219343960476), ('average_loss', 0.9220383233516464), ('n_updates', 453219)]
ep: 904 R_agent: 59 agent stat: [('average_q', 18.499834557473328), ('average_loss', 1.034950552868072), ('n_updates', 453719)]
ep: 905 R_agent: 17 agent stat: [('average_q', 18.526772612376316), ('average_loss', 0.9625836733018999), ('n_updates', 454219)]
ep: 906 R_agent: 17 agent stat: [('average_q', 18.49272275271033), ('average_loss', 0.9120844419007264), ('n_updates', 454719)]
ep: 907 R_agent: -4 agent stat: [('average_q', 18.64562195318232), ('average_loss', 0.94828780308027

ep: 964 R_agent: 17 agent stat: [('average_q', 18.699932471418794), ('average_loss', 0.8916432069401838), ('n_updates', 483719)]
ep: 965 R_agent: -25 agent stat: [('average_q', 18.713755362770474), ('average_loss', 0.958713907193493), ('n_updates', 484219)]
ep: 966 R_agent: -25 agent stat: [('average_q', 18.78537806820548), ('average_loss', 0.8958272551851731), ('n_updates', 484719)]
ep: 967 R_agent: 17 agent stat: [('average_q', 18.72467140157024), ('average_loss', 1.0228178876256018), ('n_updates', 485219)]
ep: 968 R_agent: 38 agent stat: [('average_q', 18.680221996163777), ('average_loss', 0.8673153678208014), ('n_updates', 485719)]
ep: 969 R_agent: -4 agent stat: [('average_q', 18.688144084116054), ('average_loss', 1.0326051977646944), ('n_updates', 486219)]
ep: 970 R_agent: 17 agent stat: [('average_q', 18.590413396635952), ('average_loss', 1.012738948111056), ('n_updates', 486719)]
ep: 971 R_agent: 59 agent stat: [('average_q', 18.694623918896596), ('average_loss', 1.014631510267

In [451]:
chosen_one

<chainerrl.agents.double_dqn.DoubleDQN at 0x1ce7be1c488>

In [452]:
chosen_one.save("chosen_one_model")

In [453]:
generator.save("generator_model")