# Reinforcement Learning Space Invaders

In [1]:
#Imports
import numpy as np
import os
import gym
from gym import error, spaces
from gym import utils
from gym.utils import seeding


try:
    import atari_py
except ImportError as e:
    raise error.DependencyNotInstalled("{}. (HINT: you can install Atari dependencies by running 'pip install gym[atari]'.)".format(e))

In [2]:
import random
def eps_greedy_policy(q_values, eps):
    '''
    Creates an epsilon-greedy policy.

    A single uniform random number decides which of two distributions is
    returned: with probability ``eps`` the uniform distribution over all
    actions, otherwise a one-hot distribution on the greedy (arg-max) action.

    :param q_values: set of Q-values of shape (num actions,)
    :param eps: probability of taking a uniform random action
    :return: policy of shape (num actions,); a floating-point probability
             vector that sums to 1
    '''
    q_values = np.asarray(q_values)

    # The policy must be floating point.  Inheriting an integer dtype from
    # the Q-values would silently truncate 1/num_actions to 0 in the uniform
    # branch.  Floating Q-value dtypes (e.g. float32) are preserved.
    if np.issubdtype(q_values.dtype, np.floating):
        policy_dtype = q_values.dtype
    else:
        policy_dtype = np.float64
    policy = np.zeros(q_values.shape, dtype=policy_dtype)

    r = random.uniform(0, 1)

    if r < eps:
        # explore: every action is equally likely
        policy[:] = 1. / len(q_values)
    else:
        # exploit: all probability mass on the best action
        greedy_action = np.argmax(q_values)
        policy[greedy_action] = 1.

    return policy

def calculate_td_targets(q1_batch, q2_batch, r_batch, t_batch, gamma=.99):
    '''
    Calculates the double-Q-learning TD-target used for the loss:

        Y = r + gamma * Q_target(s', argmax_a Q_online(s', a))

    where the bootstrap term is dropped for terminal next states.

    : param q1_batch: Batch of Q(s', a) from online network, shape (N, num actions)
    : param q2_batch: Batch of Q(s', a) from target network, shape (N, num actions)
    : param r_batch: Batch of rewards, shape (N, 1)
    : param t_batch: Batch of booleans indicating if state, s' is terminal, shape (N, 1)
    : param gamma: discount factor
    : return: TD-target, same shape as r_batch; always floating point
              (a floating reward dtype is preserved, other dtypes become float64)
    '''
    online_q = np.asarray(q1_batch)
    target_q = np.asarray(q2_batch)
    rewards = np.asarray(r_batch)

    # Targets are real-valued; accumulating them in an integer reward dtype
    # would truncate (or reject) the discounted bootstrap term.
    if not np.issubdtype(rewards.dtype, np.floating):
        rewards = rewards.astype(np.float64)

    num_samples = rewards.shape[0]
    if num_samples == 0:
        return rewards.copy()

    # Action selection by the online network, evaluation by the target network.
    greedy_actions = np.argmax(online_q, axis=1)
    bootstrap = target_q[np.arange(num_samples), greedy_actions]

    # np.where (rather than multiplying by a mask) keeps NaN/inf estimates of
    # terminal states from leaking into the target.
    non_terminal = ~np.asarray(t_batch).astype(bool).reshape(num_samples)
    discounted = np.where(non_terminal, gamma * bootstrap, 0.0)

    # Give the per-sample term the same layout as the rewards, e.g. (N, 1).
    discounted = discounted.reshape((num_samples,) + (1,) * (rewards.ndim - 1))

    return (rewards + discounted).astype(rewards.dtype)


In [3]:
def to_ram(ale):
    """Return the emulator RAM as a 1-D ``uint8`` array.

    :param ale: object exposing ``getRAMSize()`` and ``getRAM(buffer)``.
        ``getRAM`` is handed a zero-initialised buffer of ``getRAMSize()``
        bytes, which is then returned (presumably filled in place by the call).
    :return: the buffer that was passed to ``ale.getRAM``
    """
    buffer = np.zeros(ale.getRAMSize(), dtype=np.uint8)
    ale.getRAM(buffer)
    return buffer

In [12]:
import numpy as np
import gym
from keras.utils.np_utils import to_categorical as one_hot
from collections import namedtuple
from dqn_model import DoubleQLearningModel, ExperienceReplay

def train_loop_ddqn(model, env, num_episodes, batch_size=64, gamma=.94,
                    replay_buffer=None, render=True):
    """Train ``model`` with double Q-learning and experience replay.

    :param model: object exposing ``get_q_values(state)``,
        ``get_q_values_for_both_models(states)`` and
        ``update(states, td_targets, actions)``
    :param env: gym-style environment (``reset``, ``step``, ``render``,
        ``action_space.n``)
    :param num_episodes: number of episodes to play
    :param batch_size: number of transitions sampled per training step
    :param gamma: discount factor used for the TD-targets
    :param replay_buffer: object exposing ``add(transition)``,
        ``buffer_length`` and ``sample_minibatch(batch_size)``.  When omitted,
        the notebook-level global ``replay_buffer`` is used, which is what
        earlier versions of this function always did.
    :param render: call ``env.render()`` on every step (default keeps the
        previous behaviour); pass ``False`` for headless/faster training
    :return: ``(R_buffer, R_avg)`` - per-episode rewards and their
        exponential running average
    :raises NameError: if no replay buffer is passed and no global
        ``replay_buffer`` exists
    """
    if replay_buffer is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            replay_buffer = globals()["replay_buffer"]
        except KeyError:
            raise NameError("name 'replay_buffer' is not defined; "
                            "pass replay_buffer=... to train_loop_ddqn")

    # Taken from the environment instead of a notebook global so the function
    # does not depend on a cell that is executed later.
    num_actions = env.action_space.n

    Transition = namedtuple("Transition", ["s", "a", "r", "next_s", "t"])
    eps = 1.
    eps_end = .1
    eps_decay = .0005
    R_buffer = []
    R_avg = []
    for i in range(num_episodes):
        state = env.reset() # reset to initial state
        state = np.expand_dims(state, axis=0)/2
        terminal = False # reset terminal flag
        ep_reward = 0
        q_buffer = []
        steps = 0
        while not terminal:
            if render:
                env.render()
            steps += 1
            q_values = model.get_q_values(state)
            q_buffer.append(q_values)
            policy = eps_greedy_policy(q_values.squeeze(), eps)
            action = np.random.choice(num_actions, p=policy) # sample action from epsilon-greedy policy
            new_state, reward, terminal, _ = env.step(action) # take one step in the environment
            new_state = np.expand_dims(new_state, axis=0)/2

            # Only use the terminal flag for ending the episode, not for
            # training, when it coincides with step 200.
            # NOTE(review): the 200-step cut-off looks inherited from a
            # time-limited task; confirm it is still wanted for this env.
            t_to_buffer = terminal if not steps == 200 else False

            # store data to replay buffer
            replay_buffer.add(Transition(s=state, a=action, r=reward, next_s=new_state, t=t_to_buffer))
            state = new_state
            ep_reward += reward

            # if buffer contains more than 1000 samples, perform one training step
            if replay_buffer.buffer_length > 1000:
                s, a, r, s_, t = replay_buffer.sample_minibatch(batch_size) # sample a minibatch of transitions
                q_1, q_2 = model.get_q_values_for_both_models(np.squeeze(s_))
                td_target = calculate_td_targets(q_1, q_2, r, t, gamma)
                model.update(s, td_target, a)

        eps = max(eps - eps_decay, eps_end) # decrease epsilon once per episode
        R_buffer.append(ep_reward)

        # running average of episodic rewards
        if i > 0:
            R_avg.append(.05 * R_buffer[i] + .95 * R_avg[i-1])
        else:
            R_avg.append(R_buffer[i])
        print('Episode: ', i, 'Reward:', ep_reward, 'Epsilon', eps, 'mean q', np.mean(np.array(q_buffer)))

    return R_buffer, R_avg

In [13]:
class AtariEnv(gym.Env, utils.EzPickle):
    """Gym environment wrapping an Atari game run by the ALE emulator (``atari_py``).

    Observations are either the emulator RAM (``obs_type='ram'``) or the RGB
    screen (``obs_type='image'``).

    NOTE(review): the cells visible in this notebook obtain their environments
    through ``gym.make(...)`` and never instantiate this class directly.
    """
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, game='space_invaders', obs_type='ram', frameskip=(2, 5), repeat_action_probability=0.):
        """Frameskip should be either a tuple (indicating a random range to
        choose from, with the top value exclude), or an int.

        :param game: ROM name handed to ``atari_py.get_game_path``.  The
            default used to be the gym environment id 'SpaceInvaders-v0',
            which is not a ROM name and made default construction fail with
            IOError.
        :param obs_type: 'ram' or 'image'
        :param frameskip: int, or ``(low, high)`` range sampled on every step
        :param repeat_action_probability: forwarded to the ALE setting of the
            same name
        :raises IOError: if the ROM path does not exist
        """

        utils.EzPickle.__init__(self, game, obs_type, frameskip, repeat_action_probability)
        assert obs_type in ('ram', 'image')

        self.game_path = atari_py.get_game_path(game)
        if not os.path.exists(self.game_path):
            raise IOError('You asked for game %s but path %s does not exist'%(game, self.game_path))
        self._obs_type = obs_type
        self.frameskip = frameskip
        self.ale = atari_py.ALEInterface()
        self.viewer = None  # created lazily by render(mode='human')

        # Tune (or disable) ALE's action repeat:
        # https://github.com/openai/gym/issues/349
        assert isinstance(repeat_action_probability, (float, int)), "Invalid repeat_action_probability: {!r}".format(repeat_action_probability)
        self.ale.setFloat('repeat_action_probability'.encode('utf-8'), repeat_action_probability)

        # seed() also loads the ROM, so it must run before the action set
        # and screen dimensions are queried.
        self.seed()

        self._action_set = self.ale.getMinimalActionSet()
        self.action_space = spaces.Discrete(len(self._action_set))

        (screen_width,screen_height) = self.ale.getScreenDims()
        if self._obs_type == 'ram':
            self.observation_space = spaces.Box(low=0, high=255, dtype=np.uint8, shape=(128,))
        elif self._obs_type == 'image':
            self.observation_space = spaces.Box(low=0, high=255, shape=(screen_height, screen_width, 3), dtype=np.uint8)
        else:
            # Only reachable when assertions are disabled (python -O).
            raise error.Error('Unrecognized observation type: {}'.format(self._obs_type))

    def seed(self, seed=None):
        """Seed the environment RNG and the emulator, then (re)load the ROM.

        :return: ``[seed1, seed2]`` - the numpy seed and the derived ALE seed
        """
        self.np_random, seed1 = seeding.np_random(seed)
        # Derive a random seed. This gets passed as a uint, but gets
        # checked as an int elsewhere, so we need to keep it below
        # 2**31.
        seed2 = seeding.hash_seed(seed1 + 1) % 2**31
        # Empirically, we need to seed before loading the ROM.
        self.ale.setInt(b'random_seed', seed2)
        self.ale.loadROM(self.game_path)
        return [seed1, seed2]

    def step(self, a):
        """Repeat action index ``a`` for ``frameskip`` emulator frames.

        :param a: index into the minimal action set
        :return: ``(observation, summed reward, game over flag, info dict)``
        """
        reward = 0.0
        action = self._action_set[a]

        if isinstance(self.frameskip, int):
            num_steps = self.frameskip
        else:
            # random frameskip drawn from [low, high)
            num_steps = self.np_random.randint(self.frameskip[0], self.frameskip[1])
        for _ in range(num_steps):
            reward += self.ale.act(action)
        ob = self._get_obs()

        return ob, reward, self.ale.game_over(), {"ale.lives": self.ale.lives()}

    def _get_image(self):
        """Return the current screen as returned by ``ale.getScreenRGB2()``."""
        return self.ale.getScreenRGB2()

    def _get_ram(self):
        """Return the current emulator RAM via ``to_ram``."""
        return to_ram(self.ale)

    @property
    def _n_actions(self):
        """Number of actions in the minimal action set."""
        return len(self._action_set)

    def _get_obs(self):
        """Return the observation matching ``obs_type``.

        :raises error.Error: if the observation type is not 'ram' or 'image'
            (previously this fell through to an unbound local variable)
        """
        if self._obs_type == 'ram':
            return self._get_ram()
        elif self._obs_type == 'image':
            return self._get_image()
        raise error.Error('Unrecognized observation type: {}'.format(self._obs_type))

    # return: (states, observations)
    def reset(self):
        """Reset the game and return the first observation."""
        self.ale.reset_game()
        return self._get_obs()

    def render(self, mode='human'):
        """Render the current frame.

        ``'rgb_array'`` returns the frame; ``'human'`` shows it in a lazily
        created viewer window and returns whether that window is open.  Any
        other mode returns ``None``.
        """
        img = self._get_image()
        if mode == 'rgb_array':
            return img
        elif mode == 'human':
            # imported lazily so that headless use does not need the viewer
            from gym.envs.classic_control import rendering
            if self.viewer is None:
                self.viewer = rendering.SimpleImageViewer()
            self.viewer.imshow(img)
            return self.viewer.isopen

    def close(self):
        """Close the viewer window, if one was opened."""
        if self.viewer is not None:
            self.viewer.close()
            self.viewer = None

    def get_action_meanings(self):
        """Return the names of the actions in the minimal action set.

        Uses the module-level ``ACTION_MEANING`` table.
        """
        return [ACTION_MEANING[i] for i in self._action_set]

    def get_keys_to_action(self):
        """Map sorted tuples of key codes to action indices.

        A key belongs to an action when its keyword (e.g. 'LEFT') occurs in
        the action's name, so 'LEFTFIRE' maps from the 'a' and space keys.
        """
        KEYWORD_TO_KEY = {
            'UP':      ord('w'),
            'DOWN':    ord('s'),
            'LEFT':    ord('a'),
            'RIGHT':   ord('d'),
            'FIRE':    ord(' '),
        }

        keys_to_action = {}

        for action_id, action_meaning in enumerate(self.get_action_meanings()):
            keys = []
            for keyword, key in KEYWORD_TO_KEY.items():
                if keyword in action_meaning:
                    keys.append(key)
            keys = tuple(sorted(keys))

            # two actions must never share the same key combination
            assert keys not in keys_to_action
            keys_to_action[keys] = action_id

        return keys_to_action

    def clone_state(self):
        """Clone emulator state w/o system state. Restoring this state will
        *not* give an identical environment. For complete cloning and restoring
        of the full state, see `{clone,restore}_full_state()`."""
        state_ref = self.ale.cloneState()
        state = self.ale.encodeState(state_ref)
        self.ale.deleteState(state_ref)
        return state

    def restore_state(self, state):
        """Restore emulator state w/o system state."""
        state_ref = self.ale.decodeState(state)
        self.ale.restoreState(state_ref)
        self.ale.deleteState(state_ref)

    def clone_full_state(self):
        """Clone emulator state w/ system state including pseudorandomness.
        Restoring this state will give an identical environment."""
        state_ref = self.ale.cloneSystemState()
        state = self.ale.encodeState(state_ref)
        self.ale.deleteState(state_ref)
        return state

    def restore_full_state(self, state):
        """Restore emulator state w/ system state including pseudorandomness."""
        state_ref = self.ale.decodeState(state)
        self.ale.restoreSystemState(state_ref)
        self.ale.deleteState(state_ref)

# Name of every action id: the position in the list below is the id.
ACTION_MEANING = dict(enumerate([
    "NOOP",
    "FIRE",
    "UP",
    "RIGHT",
    "LEFT",
    "DOWN",
    "UPRIGHT",
    "UPLEFT",
    "DOWNRIGHT",
    "DOWNLEFT",
    "UPFIRE",
    "RIGHTFIRE",
    "LEFTFIRE",
    "DOWNFIRE",
    "UPRIGHTFIRE",
    "UPLEFTFIRE",
    "DOWNRIGHTFIRE",
    "DOWNLEFTFIRE",
]))

In [14]:
# Create the environment (RAM observations of Space Invaders)
env = gym.make("SpaceInvaders-ram-v0")

# Sizes derived from the environment.
# `model`, `num_actions` and `replay_buffer` are notebook-level names that
# other cells rely on, so they must keep these exact names.
num_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Our neural network model used to estimate the Q-values
model = DoubleQLearningModel(state_dim=obs_dim, action_dim=num_actions, learning_rate=1e-4)

# Create replay buffer, where experience in the form of tuples <s,a,r,s',t>
# gathered from the environment is stored for training
replay_buffer = ExperienceReplay(state_size=obs_dim)

# Train (gamma is left at train_loop_ddqn's default).
# R holds the per-episode rewards, R_avg their running average.
num_episodes = 2000 
batch_size = 128 
R, R_avg = train_loop_ddqn(model, env, num_episodes, batch_size) 

Episode:  0 Reward: 75.0 Epsilon 0.9995 mean q -1.635741e-05
Episode:  1 Reward: 105.0 Epsilon 0.9990000000000001 mean q 0.019506026
Episode:  2 Reward: 50.0 Epsilon 0.9985000000000002 mean q 3.3235056
Episode:  3 Reward: 30.0 Epsilon 0.9980000000000002 mean q 5.3060365
Episode:  4 Reward: 75.0 Epsilon 0.9975000000000003 mean q 4.168429
Episode:  5 Reward: 135.0 Epsilon 0.9970000000000003 mean q 5.3198485
Episode:  6 Reward: 90.0 Epsilon 0.9965000000000004 mean q 6.465825
Episode:  7 Reward: 105.0 Epsilon 0.9960000000000004 mean q 5.1163793
Episode:  8 Reward: 210.0 Epsilon 0.9955000000000005 mean q 6.548742
Episode:  9 Reward: 105.0 Epsilon 0.9950000000000006 mean q 4.9635496
Episode:  10 Reward: 135.0 Epsilon 0.9945000000000006 mean q 2.7467902
Episode:  11 Reward: 110.0 Epsilon 0.9940000000000007 mean q 3.3210301
Episode:  12 Reward: 245.0 Epsilon 0.9935000000000007 mean q 6.7373
Episode:  13 Reward: 105.0 Epsilon 0.9930000000000008 mean q 4.789228
Episode:  14 Reward: 45.0 Epsilon 

Episode:  117 Reward: 90.0 Epsilon 0.9410000000000065 mean q 4.2474623
Episode:  118 Reward: 45.0 Epsilon 0.9405000000000066 mean q 2.695311
Episode:  119 Reward: 80.0 Epsilon 0.9400000000000066 mean q 2.003827
Episode:  120 Reward: 270.0 Epsilon 0.9395000000000067 mean q 4.224215
Episode:  121 Reward: 180.0 Epsilon 0.9390000000000067 mean q 4.53043
Episode:  122 Reward: 110.0 Epsilon 0.9385000000000068 mean q 3.9871812
Episode:  123 Reward: 90.0 Epsilon 0.9380000000000068 mean q 3.9654565
Episode:  124 Reward: 180.0 Epsilon 0.9375000000000069 mean q 4.154578
Episode:  125 Reward: 60.0 Epsilon 0.9370000000000069 mean q 2.8047628
Episode:  126 Reward: 60.0 Epsilon 0.936500000000007 mean q 3.8609192
Episode:  127 Reward: 180.0 Epsilon 0.936000000000007 mean q 4.38183
Episode:  128 Reward: 150.0 Epsilon 0.9355000000000071 mean q 4.206466
Episode:  129 Reward: 205.0 Epsilon 0.9350000000000072 mean q 3.0707884
Episode:  130 Reward: 140.0 Epsilon 0.9345000000000072 mean q 3.8382564
Episode: 

Episode:  233 Reward: 80.0 Epsilon 0.8830000000000129 mean q 2.3577392
Episode:  234 Reward: 235.0 Epsilon 0.8825000000000129 mean q 3.1047587
Episode:  235 Reward: 230.0 Epsilon 0.882000000000013 mean q 3.5488503
Episode:  236 Reward: 210.0 Epsilon 0.881500000000013 mean q 3.3005116
Episode:  237 Reward: 110.0 Epsilon 0.8810000000000131 mean q 2.5167751
Episode:  238 Reward: 385.0 Epsilon 0.8805000000000132 mean q 3.2019997
Episode:  239 Reward: 45.0 Epsilon 0.8800000000000132 mean q 2.9402237
Episode:  240 Reward: 215.0 Epsilon 0.8795000000000133 mean q 4.415627
Episode:  241 Reward: 110.0 Epsilon 0.8790000000000133 mean q 4.097623
Episode:  242 Reward: 110.0 Epsilon 0.8785000000000134 mean q 2.6386933
Episode:  243 Reward: 135.0 Epsilon 0.8780000000000134 mean q 2.78817
Episode:  244 Reward: 100.0 Epsilon 0.8775000000000135 mean q 3.400431
Episode:  245 Reward: 215.0 Epsilon 0.8770000000000135 mean q 2.5281608
Episode:  246 Reward: 55.0 Epsilon 0.8765000000000136 mean q 2.5550075
Ep

Episode:  348 Reward: 160.0 Epsilon 0.8255000000000192 mean q 3.9559734
Episode:  349 Reward: 185.0 Epsilon 0.8250000000000193 mean q 2.2612488
Episode:  350 Reward: 160.0 Epsilon 0.8245000000000193 mean q 3.6409025
Episode:  351 Reward: 210.0 Epsilon 0.8240000000000194 mean q 2.58619
Episode:  352 Reward: 210.0 Epsilon 0.8235000000000194 mean q 3.15424
Episode:  353 Reward: 165.0 Epsilon 0.8230000000000195 mean q 3.463511
Episode:  354 Reward: 155.0 Epsilon 0.8225000000000195 mean q 3.3753948
Episode:  355 Reward: 105.0 Epsilon 0.8220000000000196 mean q 3.0148857
Episode:  356 Reward: 70.0 Epsilon 0.8215000000000197 mean q 3.3705156
Episode:  357 Reward: 135.0 Epsilon 0.8210000000000197 mean q 3.0448925
Episode:  358 Reward: 30.0 Epsilon 0.8205000000000198 mean q 2.9817352
Episode:  359 Reward: 110.0 Epsilon 0.8200000000000198 mean q 3.6917713
Episode:  360 Reward: 90.0 Epsilon 0.8195000000000199 mean q 3.3374076
Episode:  361 Reward: 410.0 Epsilon 0.8190000000000199 mean q 3.3947449


Episode:  463 Reward: 155.0 Epsilon 0.7680000000000256 mean q 3.2587206
Episode:  464 Reward: 90.0 Epsilon 0.7675000000000256 mean q 4.3621726
Episode:  465 Reward: 125.0 Epsilon 0.7670000000000257 mean q 3.1367328
Episode:  466 Reward: 185.0 Epsilon 0.7665000000000257 mean q 3.6708233
Episode:  467 Reward: 110.0 Epsilon 0.7660000000000258 mean q 2.48805
Episode:  468 Reward: 50.0 Epsilon 0.7655000000000258 mean q 2.2021458
Episode:  469 Reward: 180.0 Epsilon 0.7650000000000259 mean q 3.497818
Episode:  470 Reward: 70.0 Epsilon 0.7645000000000259 mean q 2.4310052
Episode:  471 Reward: 135.0 Epsilon 0.764000000000026 mean q 3.748805
Episode:  472 Reward: 155.0 Epsilon 0.763500000000026 mean q 2.9159439
Episode:  473 Reward: 200.0 Epsilon 0.7630000000000261 mean q 3.5930617
Episode:  474 Reward: 155.0 Epsilon 0.7625000000000262 mean q 4.217102
Episode:  475 Reward: 240.0 Epsilon 0.7620000000000262 mean q 3.2368746
Episode:  476 Reward: 180.0 Epsilon 0.7615000000000263 mean q 3.859058
Epi

Episode:  578 Reward: 105.0 Epsilon 0.7105000000000319 mean q 2.8126466
Episode:  579 Reward: 115.0 Epsilon 0.7100000000000319 mean q 2.6591527
Episode:  580 Reward: 80.0 Epsilon 0.709500000000032 mean q 3.116161
Episode:  581 Reward: 45.0 Epsilon 0.709000000000032 mean q 3.2482011
Episode:  582 Reward: 180.0 Epsilon 0.7085000000000321 mean q 2.4185548
Episode:  583 Reward: 215.0 Epsilon 0.7080000000000322 mean q 3.5559301
Episode:  584 Reward: 30.0 Epsilon 0.7075000000000322 mean q 2.0250971
Episode:  585 Reward: 335.0 Epsilon 0.7070000000000323 mean q 3.6653533
Episode:  586 Reward: 160.0 Epsilon 0.7065000000000323 mean q 3.8574061
Episode:  587 Reward: 415.0 Epsilon 0.7060000000000324 mean q 4.310044
Episode:  588 Reward: 125.0 Epsilon 0.7055000000000324 mean q 3.2323015
Episode:  589 Reward: 125.0 Epsilon 0.7050000000000325 mean q 2.723282
Episode:  590 Reward: 15.0 Epsilon 0.7045000000000325 mean q 2.293179
Episode:  591 Reward: 210.0 Epsilon 0.7040000000000326 mean q 3.8272307
Ep

Episode:  693 Reward: 135.0 Epsilon 0.6530000000000382 mean q 3.3731885
Episode:  694 Reward: 155.0 Epsilon 0.6525000000000383 mean q 4.377128
Episode:  695 Reward: 210.0 Epsilon 0.6520000000000383 mean q 4.111068
Episode:  696 Reward: 125.0 Epsilon 0.6515000000000384 mean q 3.848222
Episode:  697 Reward: 185.0 Epsilon 0.6510000000000384 mean q 3.7114022
Episode:  698 Reward: 90.0 Epsilon 0.6505000000000385 mean q 2.6845212
Episode:  699 Reward: 110.0 Epsilon 0.6500000000000385 mean q 2.5768666
Episode:  700 Reward: 20.0 Epsilon 0.6495000000000386 mean q 2.3819883
Episode:  701 Reward: 100.0 Epsilon 0.6490000000000387 mean q 3.290349
Episode:  702 Reward: 135.0 Epsilon 0.6485000000000387 mean q 3.757516
Episode:  703 Reward: 330.0 Epsilon 0.6480000000000388 mean q 3.9556198
Episode:  704 Reward: 215.0 Epsilon 0.6475000000000388 mean q 3.8458486
Episode:  705 Reward: 105.0 Epsilon 0.6470000000000389 mean q 2.9117167
Episode:  706 Reward: 90.0 Epsilon 0.6465000000000389 mean q 2.3599963


Episode:  808 Reward: 35.0 Epsilon 0.5955000000000445 mean q 2.5760257
Episode:  809 Reward: 105.0 Epsilon 0.5950000000000446 mean q 4.2481027
Episode:  810 Reward: 180.0 Epsilon 0.5945000000000447 mean q 3.5718029
Episode:  811 Reward: 215.0 Epsilon 0.5940000000000447 mean q 3.6053114
Episode:  812 Reward: 180.0 Epsilon 0.5935000000000448 mean q 4.169698
Episode:  813 Reward: 55.0 Epsilon 0.5930000000000448 mean q 3.6819904
Episode:  814 Reward: 155.0 Epsilon 0.5925000000000449 mean q 3.5639427
Episode:  815 Reward: 120.0 Epsilon 0.5920000000000449 mean q 3.6973443
Episode:  816 Reward: 185.0 Epsilon 0.591500000000045 mean q 3.1262393
Episode:  817 Reward: 80.0 Epsilon 0.591000000000045 mean q 3.593118
Episode:  818 Reward: 160.0 Epsilon 0.5905000000000451 mean q 3.205404
Episode:  819 Reward: 345.0 Epsilon 0.5900000000000452 mean q 3.52454
Episode:  820 Reward: 105.0 Epsilon 0.5895000000000452 mean q 3.1293373
Episode:  821 Reward: 155.0 Epsilon 0.5890000000000453 mean q 3.6511033
Ep

Episode:  923 Reward: 120.0 Epsilon 0.5380000000000509 mean q 4.1033115
Episode:  924 Reward: 410.0 Epsilon 0.5375000000000509 mean q 4.1809964
Episode:  925 Reward: 215.0 Epsilon 0.537000000000051 mean q 4.3713913
Episode:  926 Reward: 285.0 Epsilon 0.536500000000051 mean q 3.976078
Episode:  927 Reward: 40.0 Epsilon 0.5360000000000511 mean q 2.7672377
Episode:  928 Reward: 230.0 Epsilon 0.5355000000000512 mean q 3.872207
Episode:  929 Reward: 490.0 Epsilon 0.5350000000000512 mean q 4.1413636
Episode:  930 Reward: 180.0 Epsilon 0.5345000000000513 mean q 4.268289
Episode:  931 Reward: 130.0 Epsilon 0.5340000000000513 mean q 3.6997678
Episode:  932 Reward: 20.0 Epsilon 0.5335000000000514 mean q 2.177476
Episode:  933 Reward: 240.0 Epsilon 0.5330000000000514 mean q 2.2132342
Episode:  934 Reward: 120.0 Epsilon 0.5325000000000515 mean q 2.2878726
Episode:  935 Reward: 150.0 Epsilon 0.5320000000000515 mean q 3.4334815
Episode:  936 Reward: 130.0 Epsilon 0.5315000000000516 mean q 3.327064
E

Episode:  1037 Reward: 80.0 Epsilon 0.48100000000005505 mean q 3.6107576
Episode:  1038 Reward: 65.0 Epsilon 0.48050000000005505 mean q 2.6536047
Episode:  1039 Reward: 260.0 Epsilon 0.48000000000005505 mean q 4.2821326
Episode:  1040 Reward: 160.0 Epsilon 0.47950000000005505 mean q 3.8079026
Episode:  1041 Reward: 180.0 Epsilon 0.47900000000005505 mean q 3.0490625
Episode:  1042 Reward: 215.0 Epsilon 0.47850000000005505 mean q 2.6942132
Episode:  1043 Reward: 35.0 Epsilon 0.47800000000005505 mean q 3.1254897
Episode:  1044 Reward: 110.0 Epsilon 0.47750000000005505 mean q 2.9949934
Episode:  1045 Reward: 180.0 Epsilon 0.47700000000005505 mean q 3.6049907
Episode:  1046 Reward: 120.0 Epsilon 0.47650000000005505 mean q 3.7180412
Episode:  1047 Reward: 410.0 Epsilon 0.47600000000005505 mean q 3.4990737
Episode:  1048 Reward: 35.0 Epsilon 0.47550000000005505 mean q 2.4229805
Episode:  1049 Reward: 150.0 Epsilon 0.47500000000005504 mean q 3.314945
Episode:  1050 Reward: 135.0 Epsilon 0.4745

KeyboardInterrupt: 

In [11]:
# Release the training environment now that the training cell has stopped.
env.close();

In [None]:
import time
# Watch the trained DQN agent play.  Relies on `model`, `num_actions` and
# `eps_greedy_policy` from the cells above.
num_episodes = 1
env = gym.make("SpaceInvaders-ram-v0")

try:
    for i in range(num_episodes):
        state = env.reset() # reset to initial state
        state = np.expand_dims(state, axis=0)/2  # same scaling as during training
        terminal = False # reset terminal flag
        while not terminal:
            env.render()
            time.sleep(.05)  # slow playback down so it can be followed by eye
            q_values = model.get_q_values(state)
            # mostly greedy: 10% of the actions are still chosen at random
            policy = eps_greedy_policy(q_values.squeeze(), .1)
            action = np.random.choice(num_actions, p=policy)
            state, reward, terminal, _ = env.step(action) # take one step in the environment
            state = np.expand_dims(state, axis=0)/2
finally:
    # close the window even if playback is interrupted
    env.close()

In [1]:
# OpenGym CartPole-v0 with A3C on GPU (adapted here to SpaceInvaders-ram-v0, see ENV below)
# -----------------------------------
#
# A3C implementation with GPU optimizer threads.
# 
# Made as part of blog series Let's make an A3C, available at
# https://jaromiru.com/2017/02/16/lets-make-an-a3c-theory/
#
# author: Jaromir Janisch, 2017

import numpy as np
import tensorflow as tf

import gym, time, random, threading

from keras.models import *
from keras.layers import *
from keras import backend as K

#-- constants
ENV = 'SpaceInvaders-ram-v0'	# gym environment id passed to gym.make

RUN_TIME = 30			# training duration: the main script sleeps this many seconds
THREADS = 8				# number of Environment (actor) threads
OPTIMIZERS = 2			# number of Optimizer threads
THREAD_DELAY = 0.001	# sleep per environment step, lets other threads run

GAMMA = 0.99			# discount factor

N_STEP_RETURN = 8		# length of the n-step return window kept by each Agent
GAMMA_N = GAMMA ** N_STEP_RETURN	# discount applied to the bootstrapped value

# Exploration rate is interpolated linearly from EPS_START to EPS_STOP over
# the first EPS_STEPS calls to Agent.act (counted in the global `frames`).
EPS_START = 0.4
EPS_STOP  = .15
EPS_STEPS = 75000

MIN_BATCH = 32			# minimum number of queued samples before Brain.optimize trains
LEARNING_RATE = 5e-3	# RMSProp learning rate

LOSS_V = .5			# value loss coefficient
LOSS_ENTROPY = .01 	# entropy coefficient

#---------
class Brain:
    """Shared actor-critic network plus the queue of training samples.

    Agents push n-step samples with ``train_push``; Optimizer threads call
    ``optimize`` to turn the queued samples into one gradient step.  Relies on
    the module-level ``NUM_STATE``, ``NUM_ACTIONS`` and ``NONE_STATE``.
    """
    # Class-level defaults.  ``optimize`` rebinds ``self.train_queue`` on the
    # instance, which works because a single Brain is created.
    train_queue = [ [], [], [], [], [] ]    # s, a, r, s', s' terminal mask
    lock_queue = threading.Lock()

    def __init__(self):
        self.session = tf.Session()
        K.set_session(self.session)
        K.manual_variable_initialization(True)

        self.model = self._build_model()
        self.graph = self._build_graph(self.model)

        self.session.run(tf.global_variables_initializer())
        self.default_graph = tf.get_default_graph()

        self.default_graph.finalize()    # avoid modifications

    def _build_model(self):
        """Build the network: one shared hidden layer feeding a softmax policy
        head and a linear value head."""

        l_input = Input( batch_shape=(None, NUM_STATE) )
        l_dense = Dense(16, activation='relu')(l_input)

        out_actions = Dense(NUM_ACTIONS, activation='softmax')(l_dense)
        out_value   = Dense(1, activation='linear')(l_dense)

        model = Model(inputs=[l_input], outputs=[out_actions, out_value])
        model._make_predict_function()    # have to initialize before threading

        return model

    def _build_graph(self, model):
        """Build the loss and the training op.

        :return: ``(s_t, a_t, r_t, minimize)`` - placeholders for states,
            one-hot actions and n-step returns, and the RMSProp update op
        """
        s_t = tf.placeholder(tf.float32, shape=(None, NUM_STATE))
        a_t = tf.placeholder(tf.float32, shape=(None, NUM_ACTIONS))
        r_t = tf.placeholder(tf.float32, shape=(None, 1)) # not immediate, but discounted n step reward

        p, v = model(s_t)

        # `keepdims` replaces the deprecated `keep_dims` argument.
        log_prob = tf.log( tf.reduce_sum(p * a_t, axis=1, keepdims=True) + 1e-10)
        advantage = r_t - v

        # the advantage is treated as a constant in the policy gradient
        loss_policy = - log_prob * tf.stop_gradient(advantage)                                  # maximize policy
        loss_value  = LOSS_V * tf.square(advantage)                                             # minimize value error
        # sum(p * log p) is the negative entropy, so minimizing it maximizes entropy
        entropy = LOSS_ENTROPY * tf.reduce_sum(p * tf.log(p + 1e-10), axis=1, keepdims=True)    # maximize entropy (regularization)

        loss_total = tf.reduce_mean(loss_policy + loss_value + entropy)

        optimizer = tf.train.RMSPropOptimizer(LEARNING_RATE, decay=.99)
        minimize = optimizer.minimize(loss_total)

        return s_t, a_t, r_t, minimize

    def optimize(self):
        """Run one training step on all queued samples.

        Returns immediately when fewer than ``MIN_BATCH`` samples are queued.
        """
        if len(self.train_queue[0]) < MIN_BATCH:
            time.sleep(0)    # yield
            return

        with self.lock_queue:
            if len(self.train_queue[0]) < MIN_BATCH:    # more thread could have passed without lock
                return                                  # we can't yield inside lock

            # take ownership of the queued samples and start a fresh queue
            s, a, r, s_, s_mask = self.train_queue
            self.train_queue = [ [], [], [], [], [] ]

        s = np.vstack(s)
        a = np.vstack(a)
        r = np.vstack(r)
        s_ = np.vstack(s_)
        s_mask = np.vstack(s_mask)

        if len(s) > 5*MIN_BATCH: print("Optimizer alert! Minimizing batch of %d" % len(s))

        v = self.predict_v(s_)
        r = r + GAMMA_N * v * s_mask    # set v to 0 where s_ is terminal state

        s_t, a_t, r_t, minimize = self.graph
        self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})

    def train_push(self, s, a, r, s_):
        """Queue one sample.

        A terminal next state (``s_ is None``) is stored as ``NONE_STATE``
        with mask 0 so that its value estimate is ignored in ``optimize``.
        """
        with self.lock_queue:
            self.train_queue[0].append(s)
            self.train_queue[1].append(a)
            self.train_queue[2].append(r)

            if s_ is None:
                self.train_queue[3].append(NONE_STATE)
                self.train_queue[4].append(0.)
            else:
                self.train_queue[3].append(s_)
                self.train_queue[4].append(1.)

    def predict(self, s):
        """Return ``(action probabilities, state values)`` for a batch of states."""
        with self.default_graph.as_default():
            p, v = self.model.predict(s)
            return p, v

    def predict_p(self, s):
        """Return only the action probabilities for a batch of states."""
        p, _ = self.predict(s)
        return p

    def predict_v(self, s):
        """Return only the state values for a batch of states."""
        _, v = self.predict(s)
        return v

#---------
# Total number of Agent.act calls across all agents; drives the epsilon schedule.
frames = 0
class Agent:
    """Epsilon-greedy actor that turns its experience into n-step samples.

    Uses the module-level ``brain``, ``NUM_ACTIONS``, ``GAMMA``, ``GAMMA_N``
    and ``N_STEP_RETURN``.
    """

    def __init__(self, eps_start, eps_end, eps_steps):
        self.eps_start = eps_start
        self.eps_end   = eps_end
        self.eps_steps = eps_steps

        # sliding window of (s, one-hot a, r, s') used for the n-step return
        self.memory = []
        # running n-step return of the window
        self.R = 0.

    def getEpsilon(self):
        """Exploration rate for the current global frame count (linear schedule)."""
        if frames < self.eps_steps:
            # linearly interpolate between eps_start and eps_end
            return self.eps_start + frames * (self.eps_end - self.eps_start) / self.eps_steps
        return self.eps_end

    def act(self, s):
        """Pick an action for state ``s`` and advance the global frame counter."""
        global frames
        explore_prob = self.getEpsilon()
        frames += 1

        if random.random() < explore_prob:
            return random.randint(0, NUM_ACTIONS-1)

        # otherwise sample from the policy predicted by the brain
        # (sampling, not arg-max)
        probabilities = brain.predict_p(np.array([s]))[0]
        return np.random.choice(NUM_ACTIONS, p=probabilities)

    def train(self, s, a, r, s_):
        """Record one transition and push finished n-step samples to the brain.

        ``s_`` is ``None`` for a terminal transition, in which case the whole
        window is flushed.
        """
        def window_sample(span):
            # first state/action of the window, current return, and the
            # next-state of the window's span-th entry
            first_state, first_action, _, _ = self.memory[0]
            last_next_state = self.memory[span - 1][3]
            return first_state, first_action, self.R, last_next_state

        one_hot_action = np.zeros(NUM_ACTIONS)    # turn action into one-hot representation
        one_hot_action[a] = 1

        self.memory.append((s, one_hot_action, r, s_))

        self.R = (self.R + r * GAMMA_N) / GAMMA

        if s_ is None:
            # episode over: emit every remaining (shorter) window
            while self.memory:
                brain.train_push(*window_sample(len(self.memory)))

                self.R = (self.R - self.memory[0][2]) / GAMMA
                del self.memory[0]

            self.R = 0

        if len(self.memory) >= N_STEP_RETURN:
            brain.train_push(*window_sample(N_STEP_RETURN))

            self.R = self.R - self.memory[0][2]
            del self.memory[0]

    # possible edge case - if an episode ends in <N steps, the computation is incorrect

#---------
class Environment(threading.Thread):
    """Thread that owns one gym environment and one Agent and plays episodes
    until ``stop()`` is called."""
    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
        super().__init__()

        self.render = render
        self.env = gym.make(ENV)
        self.agent = Agent(eps_start, eps_end, eps_steps)

    def runEpisode(self):
        """Play one episode (or until stopped), training the agent on every
        step, then print the total reward."""
        state = self.env.reset()
        total_reward = 0
        finished = False

        while not finished:
            time.sleep(THREAD_DELAY) # yield

            if self.render:
                self.env.render()

            action = self.agent.act(state)
            next_state, reward, done, info = self.env.step(action)

            # a terminal state is signalled to the agent as None
            if done:
                next_state = None

            self.agent.train(state, action, reward, next_state)

            state = next_state
            total_reward += reward

            finished = done or self.stop_signal

        print("Total R:", total_reward)

    def run(self):
        """Thread body: keep playing episodes until the stop flag is set."""
        while True:
            if self.stop_signal:
                break
            self.runEpisode()

    def stop(self):
        """Ask the thread to finish after the current step."""
        self.stop_signal = True

#---------
class Optimizer(threading.Thread):
    """Thread that repeatedly asks the global ``brain`` to optimize until
    ``stop()`` is called."""
    stop_signal = False

    def __init__(self):
        super().__init__()

    def run(self):
        """Thread body: call ``brain.optimize()`` in a loop until stopped."""
        while True:
            if self.stop_signal:
                break
            brain.optimize()

    def stop(self):
        """Ask the thread to leave its loop."""
        self.stop_signal = True

#-- main
# A rendering, purely policy-driven (epsilon = 0) environment used to watch
# the result.  It is created first because its spaces define the network sizes.
env_test = Environment(render=True, eps_start=0., eps_end=0.)
NUM_STATE = env_test.env.observation_space.shape[0]
NUM_ACTIONS = env_test.env.action_space.n
NONE_STATE = np.zeros(NUM_STATE)    # placeholder stored for terminal next states

brain = Brain()    # brain is global in A3C

envs = [Environment() for i in range(THREADS)]
opts = [Optimizer() for i in range(OPTIMIZERS)]

try:
    for o in opts:
        o.start()

    for e in envs:
        e.start()

    # let the worker threads train for RUN_TIME seconds
    for i in range(RUN_TIME):
        time.sleep(1)
        print("Elapsed time(s): ", i)
finally:
    # Always stop the workers, even when the wait above is interrupted;
    # otherwise the threads keep running in the background of the kernel.
    # Threads that were never started (or already ended) are not joined.
    for e in envs:
        e.stop()
    for e in envs:
        if e.is_alive():
            e.join()

    for o in opts:
        o.stop()
    for o in opts:
        if o.is_alive():
            o.join()

print("Training finished")
try:
    # Runs in the calling thread (run(), not start()).  Nothing sets
    # env_test's stop flag, so this plays episodes until it is interrupted.
    env_test.run()
finally:
    env_test.env.close()

Using TensorFlow backend.


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Elapsed time(s):  0
Elapsed time(s):  1
Elapsed time(s):  2
Elapsed time(s):  3
Elapsed time(s):  4
Elapsed time(s):  5
Total R: 50.0
Elapsed time(s):  6
Elapsed time(s):  7
Elapsed time(s):  8
Total R: 50.0
Elapsed time(s):  9
Total R: 50.0
Total R: 30.0
Elapsed time(s):  10
Total R: 55.0
Elapsed time(s):  11
Total R: 120.0
Total R: 135.0
Elapsed time(s):  12
Elapsed time(s):  13
Elapsed time(s):  14
Total R: 105.0
Elapsed time(s):  15
Elapsed time(s):  16
Elapsed time(s):  17
Total R: 80.0
Total R: 255.0
Elapsed time(s):  18
Total R: 125.0
Total R: 110.0
Elapsed time(s):  19
Elapsed time(s):  20
Elapsed time(s):  21
Total R: 110.0
Elapsed time(s):  22
Elapsed time(s):  23
Elapsed time(s):  24
Total R: 80.0
Elapsed time(s):  25
Elapsed time(s):  26
Elapsed time(s):  27
Elapsed time(s):  28
Total R: 130.0
Total R: 415.0
Elapsed time(s):  29
Total R: 125.0
Total R: 210.0
Total R: 80.0
Total R: 0.0
Total R: 185.0
To

KeyboardInterrupt: 