## Atari Pong using Neural Networks

https://karpathy.github.io/2016/05/31/rl/

![Atari Pong](https://raw.githubusercontent.com/rlcode/reinforcement-learning/master/3-atari/2-pong/assets/pg.gif)

## Setup

Install OpenAI-gym and openai/atari-py:

```(mldds03) pip install gym```

```(mldds03) pip install --no-index -f https://github.com/Kojoley/atari-py/releases atari_py```

## NumPy implementation

https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5

In [15]:
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import pickle
import gym

# run configuration
max_episodes = 100 # how many episodes to run
resume = False # resume from previous checkpoint?

# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
render = False

# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume:
    # NOTE: unpickling can execute arbitrary code; only resume from a
    # 'save.p' that this script wrote itself.
    # The context manager closes the checkpoint file once it has been read.
    with open('save.p', 'rb') as checkpoint_file:
        model = pickle.load(checkpoint_file)
else:
    model = {}
    model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
    model['W2'] = np.random.randn(H) / np.sqrt(H)

# Both dicts mirror the keys/shapes of `model` and start at zero.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
    """ logistic "squashing" function: maps x (scalar or array) into the unit interval """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

    Rows 35..194 are kept, every second row/column of the first colour
    channel is sampled, and each remaining pixel becomes 1.0 unless its
    value is 0, 144 or 109 (the two background colours), in which case it
    becomes 0.0.

    The input frame is left untouched: the result is built from a boolean
    mask instead of overwriting pixels through the slice views.
    """
    I = I[35:195] # crop
    I = I[::2,::2,0] # downsample by factor of 2
    # foreground (paddles, ball) = not background type 1, not background type 2, not already 0
    foreground = (I != 144) & (I != 109) & (I != 0)
    # np.float64 is used because the np.float alias was removed in NumPy 1.24
    return foreground.astype(np.float64).ravel()

def discount_rewards(r):
    """ walk an array of rewards backwards and return the discounted return at every step

    Uses the module-level `gamma` as the discount factor. The running total
    is restarted at every non-zero reward.
    """
    returns = np.zeros_like(r)
    carry = 0
    for idx in range(r.size - 1, -1, -1):
        step_reward = r[idx]
        if step_reward != 0:
            # a non-zero reward marks a game boundary (pong specific!), so start over
            carry = 0
        carry = carry * gamma + step_reward
        returns[idx] = carry
    return returns

def policy_forward(x):
    """ forward pass through the policy network stored in the global `model`

    Returns (probability of taking action 2, hidden-layer activations).
    """
    pre_activation = model['W1'].dot(x)
    hidden = np.where(pre_activation < 0, 0, pre_activation) # ReLU nonlinearity
    logit = model['W2'].dot(hidden)
    return sigmoid(logit), hidden

def policy_backward(eph, epdlogp):
    """ backward pass: gradients of both weight matrices for one episode

    eph is the array of intermediate hidden states, epdlogp the per-step
    output gradients. Besides its arguments this function reads two
    module-level names: `model` (for W2) and `epx` (the stacked episode
    inputs), so `epx` must be assigned before calling it.
    """
    grad_w2 = eph.T.dot(epdlogp).ravel()
    grad_hidden = np.outer(epdlogp, model['W2'])
    grad_hidden[eph <= 0] = 0 # backprop through the ReLU: no gradient where the unit was off
    grad_w1 = grad_hidden.T.dot(epx)
    return {'W1': grad_w1, 'W2': grad_w2}

# Training loop for the NumPy policy network: play Pong one step at a time,
# record what is needed for backprop, and update the weights with RMSProp
# every `batch_size` finished episodes.
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0

# try/finally guarantees the environment is released even when training is
# interrupted or an exception is raised mid-episode.
try:
    while episode_number < max_episodes:
        if render: env.render()

        # preprocess the observation, set input to network to be difference image
        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        prev_x = cur_x

        # forward the policy network and sample an action from the returned probability
        aprob, h = policy_forward(x)
        action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

        # record various intermediates (needed later for backprop)
        xs.append(x) # observation
        hs.append(h) # hidden state
        y = 1 if action == 2 else 0 # a "fake label"
        dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

        # step the environment and get new measurements
        # NOTE(review): unpacks four values, i.e. assumes the classic gym step() API
        observation, reward, done, info = env.step(action)
        reward_sum += reward

        drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

        if done: # an episode finished
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            # (epx is also read as a global by policy_backward)
            epx = np.vstack(xs)
            eph = np.vstack(hs)
            epdlogp = np.vstack(dlogps)
            epr = np.vstack(drs)
            xs,hs,dlogps,drs = [],[],[],[] # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = discount_rewards(epr)
            # standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
            grad = policy_backward(eph, epdlogp)
            for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

            # perform rmsprop parameter update every batch_size episodes
            if episode_number % batch_size == 0:
                for k,v in model.items():
                    g = grad_buffer[k] # gradient
                    rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
                    model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
                    grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

            # boring book-keeping
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
            if episode_number % 10 == 0:
                # the context manager flushes and closes the checkpoint file
                with open('save.p', 'wb') as checkpoint_file:
                    pickle.dump(model, checkpoint_file)
            reward_sum = 0
            observation = env.reset() # reset env
            prev_x = None

            if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
                print('ep %d: game finished, reward: %f' % (episode_number, reward))
finally:
    env.close()

resetting env. episode reward total was -21.000000. running mean: -21.000000
ep 1: game finished, reward: -1.000000
resetting env. episode reward total was -20.000000. running mean: -20.990000
ep 2: game finished, reward: -1.000000
resetting env. episode reward total was -20.000000. running mean: -20.980100
ep 3: game finished, reward: -1.000000
resetting env. episode reward total was -21.000000. running mean: -20.980299
ep 4: game finished, reward: -1.000000
resetting env. episode reward total was -21.000000. running mean: -20.980496
ep 5: game finished, reward: -1.000000
resetting env. episode reward total was -21.000000. running mean: -20.980691
ep 6: game finished, reward: -1.000000
resetting env. episode reward total was -20.000000. running mean: -20.970884
ep 7: game finished, reward: -1.000000
resetting env. episode reward total was -21.000000. running mean: -20.971175
ep 8: game finished, reward: -1.000000
resetting env. episode reward total was -21.000000. running mean: -20.97

## Keras implementation

https://github.com/rlcode/reinforcement-learning/tree/master/3-atari/2-pong

Training should converge after roughly 8000 episodes.

![scores](https://raw.githubusercontent.com/rlcode/reinforcement-learning/master/3-atari/2-pong/assets/score.png)

In [None]:
# https://github.com/rlcode/reinforcement-learning/blob/master/3-atari/2-pong/pong_reinforce.py

import gym
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Reshape, Flatten, Conv2D
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf # only used for TensorBoard
import time

max_episodes = 10 # how many episodes to run (short demo run)
# For a full training run use max_episodes = 8000: (win - lose) should converge after 8000 runs
resume = False # resume from previous checkpoint?

# To view the logged scores, run: tensorboard --logdir=logs --host=0.0.0.0

class PGAgent:
    """Policy-gradient agent for Pong backed by a small Keras network.

    The agent collects one episode of (state, action gradient, reward,
    predicted probabilities) via `act` / `remember`, then `train` turns
    that memory into a single training batch and clears it.
    """

    def __init__(self, state_size, action_size):
        """Build the network and set up TensorBoard logging.

        state_size: length of the flattened input vector.
        action_size: number of discrete actions (size of the softmax output).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.99  # discount factor for rewards
        self.learning_rate = 0.001
        # per-episode memory, emptied at the end of train()
        self.states = []
        self.gradients = []
        self.rewards = []
        self.probs = []
        self.model = self._build_model()
        self.model.summary()
        # one log directory per run, named after the start time in seconds
        self.tensorboard = TensorBoard(log_dir='logs/atari_pong/%d' % time.time())
        self.tensorboard.set_model(self.model)

    def _build_model(self):
        """Create and compile the conv + dense policy network."""
        model = Sequential()
        # NOTE(review): with a channels-last image format this (1, 80, 80)
        # shape makes the last axis (80) the channel axis -- confirm the
        # intended data format before changing it, since saved weights
        # depend on the resulting layer shapes.
        model.add(Reshape((1, 80, 80), input_shape=(self.state_size,)))
        model.add(Conv2D(32, (6, 6), activation="relu", strides=(3, 3),
                         padding="same", kernel_initializer="he_uniform"))
        model.add(Flatten())
        model.add(Dense(64, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='softmax'))
        opt = Adam(lr=self.learning_rate)

        #
        # See note regarding crossentropy in cartpole_reinforce.py
        # https://github.com/rlcode/reinforcement-learning/blob/master/2-cartpole/3-reinforce/cartpole_reinforce.py
        #
        # Using categorical crossentropy as a loss is a trick to easily
        # implement the policy gradient. Categorical cross entropy is defined
        # H(p, q) = -sum(p_i * log(q_i)). For the action taken, a, you set
        # p_a = advantage. q_a is the output of the policy network, which is
        # the probability of taking the action a, i.e. policy(s, a).
        # All other p_i are zero, thus we have H(p, q) = -A * log(policy(s, a)),
        # so minimising the loss maximises A * log(policy(s, a)).
        #

        model.compile(loss='categorical_crossentropy', optimizer=opt)
        return model

    def remember(self, state, action, prob, reward):
        """Store one step: state, reward and (one-hot action - prob)."""
        y = np.zeros([self.action_size])
        y[action] = 1
        self.gradients.append(np.array(y).astype('float32') - prob)
        self.states.append(state)
        self.rewards.append(reward)

    def act(self, state):
        """Sample an action for `state`; return (action index, probabilities).

        The raw network output is also appended to self.probs for train().
        """
        state = state.reshape([1, state.shape[0]])
        aprob = self.model.predict(state, batch_size=1).flatten()
        self.probs.append(aprob)
        # renormalise so the probabilities passed to np.random.choice sum to 1
        prob = aprob / np.sum(aprob)
        action = np.random.choice(self.action_size, 1, p=prob)[0]
        return action, prob

    def discount_rewards(self, rewards):
        """Return the discounted return at each step, using self.gamma.

        The running total restarts at every non-zero reward.
        """
        discounted_rewards = np.zeros_like(rewards)
        running_add = 0
        for t in reversed(range(0, rewards.size)):
            if rewards[t] != 0:
                running_add = 0
            running_add = running_add * self.gamma + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def train(self):
        """Run one training batch on the remembered episode, then clear memory."""
        gradients = np.vstack(self.gradients)
        rewards = np.vstack(self.rewards)
        rewards = self.discount_rewards(rewards)
        # Standardise the returns: centre first, then scale. The previous
        # expression divided by std(rewards - mean) but never subtracted the
        # mean, so the returns were scaled but not centred.
        rewards = rewards - np.mean(rewards)
        std = np.std(rewards)
        if std > 0:  # skip the scaling when every return is identical (avoids NaNs)
            rewards = rewards / std
        gradients *= rewards
        X = np.squeeze(np.vstack([self.states]))
        # targets = predicted probabilities nudged along the advantage-weighted gradient
        Y = self.probs + self.learning_rate * np.squeeze(np.vstack([gradients]))
        self.model.train_on_batch(X, Y)
        self.states, self.probs, self.gradients, self.rewards = [], [], [], []

    def load(self, name):
        """Load model weights from the file `name`."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file `name`."""
        self.model.save_weights(name)

    def log_score(self, episode, score):
        """Write `score` to TensorBoard under the tag 'game_score' at step `episode`."""
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = score
        summary_value.tag = 'game_score'
        self.tensorboard.writer.add_summary(summary, episode)
        self.tensorboard.writer.flush()

def preprocess(I):
    """Turn a 210x160x3 uint8 frame into a flat 6400-element (80x80) float vector.

    Rows 35..194 are kept, every second row/column of the first colour
    channel is sampled, and each remaining pixel becomes 1.0 unless its
    value is 0, 144 or 109, in which case it becomes 0.0.

    The input frame is not modified: the result is built from a boolean
    mask rather than by overwriting pixels through the slice views.
    """
    I = I[35:195]  # crop
    I = I[::2, ::2, 0]  # downsample by a factor of 2, first channel only
    foreground = (I != 144) & (I != 109) & (I != 0)
    # np.float64 is used because the np.float alias was removed in NumPy 1.24
    return foreground.astype(np.float64).ravel()

# Main loop: play Pong with the Keras agent, training once per finished episode.
env = gym.make("Pong-v0")
state = env.reset()
prev_x = None  # previous preprocessed frame, used to build the difference image
score = 0
episode = 0
model_filename = 'pong_rl_keras.h5'

state_size = 80 * 80
action_size = env.action_space.n
agent = PGAgent(state_size, action_size)

if resume:
    agent.load(model_filename)

# try/finally makes sure the render window is closed even when training is
# interrupted or an exception is raised mid-episode.
try:
    while episode < max_episodes:
        env.render()

        # network input is the difference between consecutive preprocessed frames
        cur_x = preprocess(state)
        x = cur_x - prev_x if prev_x is not None else np.zeros(state_size)
        prev_x = cur_x

        action, prob = agent.act(x)
        # NOTE(review): unpacks four values, i.e. assumes the classic gym step() API
        state, reward, done, info = env.step(action)
        score += reward

        agent.remember(x, action, prob, reward)

        if done:
            agent.log_score(episode, score)
            episode += 1
            agent.train()
            print('Episode: %d - Score: %f.' % (episode, score))
            score = 0
            state = env.reset()
            prev_x = None
            # checkpoint the weights every 10 episodes
            if episode > 1 and episode % 10 == 0:
                agent.save(model_filename)
finally:
    env.close() # close the window

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 1, 80, 80)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 27, 32)         92192     
_________________________________________________________________
flatten_1 (Flatten)          (None, 864)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                55360     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 198       
Total params: 149,830
Trainable params: 149,830
Non-trainable params: 0
_________________________________________________________________
Epis