In [2]:
import numpy as np
import tensorflow as tf

# [Catch](https://gist.github.com/EderSantana/c7222daa328f0e885093) MLP on raw pixels

## Setup Environment

In [49]:
class Catch(object): # 1 game is 1 fruit dropped from top to bottom
    def __init__(self, grid_size=10):
        self.grid_size = grid_size
        self.basketSize = 0 # actually this is 2*basket_size+1
        self.reset()
        
    def reset(self):
        n = np.random.randint(0, self.grid_size-1)  # starting fruit_col
        m = np.random.randint(1, self.grid_size-2)  # starting basket col
        self.state = np.asarray([0, n, m])          # [fruit_row, fruit_col, basket]
        return self.observe()
    
    def _get_reward(self):   # inc/dec score only if fruit has dropped to bottom
        fruit_row, fruit_col, basket = self.state
        if fruit_row == self.grid_size-1:
            return 1 if abs(fruit_col - basket) <= self.basketSize else -1
        else:
            return 0

    def _is_over(self):    # game over if fruit dropped to bottom
        return (self.state[0] == self.grid_size-1)
    
    def observe(self):
        im_size = (self.grid_size, self.grid_size)
        state = self.state
        canvas = np.zeros(im_size)
        canvas[self.state[0], self.state[1]] = 1                                         # draw fruit
        canvas[-1, self.state[2]-self.basketSize:self.state[2] + self.basketSize+1] = 1  # draw basket
        return canvas.flatten()
    
    def step(self, action):
        if action == 0:   action = -1 # move left
        elif action == 1: action =  0 # stay
        else:             action =  1 # move right
        f0, f1, basket = self.state
        new_basket = min(max(self.basketSize, basket + action), self.grid_size-self.basketSize)
        f0 += 1                       # fruit dropped by one pixel
        out = np.asarray([f0, f1, new_basket])
        self.state = out
        
        return self.observe(), self._get_reward(), self._is_over() # returns whole canvas, R, done?

* Store all intermediate states 
* after each step, train a random batch from memory
* train target are the Q-values computed from current Q_tables predicted my the current model (`model.predict(state)`)

## Deep Model

In [72]:
from collections import deque
class DQN(object):  #model+memory
    def __init__(self,catchenv,
                 gamma=.9, # gamma is reward decay in computing G_t=R_t+\gamma*R_{t+1}+...
                 max_memory=500):
        self.memory = deque(maxlen=max_memory)
        self.env = catchenv
        self.gamma = gamma
        
        self.model = tf.keras.models.Sequential([
                         tf.keras.layers.Dense(100, input_shape=(self.env.grid_size**2,), activation='relu'),
                         tf.keras.layers.Dense(100, activation='relu'),
                         tf.keras.layers.Dense(3),  # [move_left, stay, move_right]
                        ])
        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())
    
    def get_epsilon(self, episode):
        ###################### EPSILON to use for epsilon-greedy (probability of exploration)
#         return 1/(1+e*.2)
#         return max(.01, 0.995**episode)
        return 0.1
        #####################################################################################

    def choose_action(self, state, epsilon):
        if np.random.random() <= epsilon:
            return np.random.randint(0, num_actions, size=1)
        else:
            return np.argmax(self.model.predict(state[np.newaxis])[0])
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay_train(self, batch_size=50):
        x_batch, y_batch = [], []  #(batch_size, n_pixels-whole canvas), (batch_size, num_actions)
        minibatch = random.sample( self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state[np.newaxis])[0]    # current estimation of the Q(s,a) (batch size 1)            
            Q_sa = np.max(model.predict(next_state[np.newaxis])[0])# max among 3 rewards (corr 3 actions) 
            if done:
                y_target[action] = reward 
            else: # reward_t + gamma * max_a' Q(s', a') ONLY for the action a that is executed; others remain unchanged
                y_target[action] = reward + self.gamma * Q_sa
            x_batch.append(state)
            y_batch.append(y_target)
            
        # learning rate handled by optimizer
#       self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)  # returns history
        return self.model.train_on_batch(np.array(x_batch), np.array(y_batch)) #returns current loss

    def run(self,episdoes=1000,explore=True):
        scores = deque(maxlen=episdoes//10) #store new episodes after previous print

        for e in range(episdoes):
            EPSILON = self.get_epsilon(e)*explore
            state = self.env.reset()
            done = False
            R = 0  #total return at the end of episode
            while not done:
                action = self.choose_action(state, EPSILON)
                next_state, reward, done = self.env.step(action)
                R += reward
                next_state = next_state
                self.remember(state, action, reward, next_state, done)
                state = next_state
            scores.append((R+1)/2)
            
            loss = self.replay_train()
            if (e+1)%(episdoes/10) == 0:
                print(f'Episode {e:05d} | Loss {loss:.4f} | Win rate {np.mean(scores):.3f}')

env=Catch(grid_size = 10)
agent = DQN(env)
agent.run()

Episode 00099 | Loss 0.0254 | Win rate 0.120
Episode 00199 | Loss 0.0261 | Win rate 0.140
Episode 00299 | Loss 0.0083 | Win rate 0.260
Episode 00399 | Loss 0.0084 | Win rate 0.280
Episode 00499 | Loss 0.0073 | Win rate 0.250
Episode 00599 | Loss 0.0296 | Win rate 0.230
Episode 00699 | Loss 0.0170 | Win rate 0.270
Episode 00799 | Loss 0.0048 | Win rate 0.290
Episode 00899 | Loss 0.0016 | Win rate 0.310
Episode 00999 | Loss 0.0016 | Win rate 0.340


## Evaluation (epsilon=0)

In [73]:
agent.run(100,explore=False)

Episode 00009 | Loss 0.0013 | Win rate 0.700
Episode 00019 | Loss 0.0042 | Win rate 0.500
Episode 00029 | Loss 0.0016 | Win rate 0.700
Episode 00039 | Loss 0.0090 | Win rate 0.700
Episode 00049 | Loss 0.0018 | Win rate 0.400
Episode 00059 | Loss 0.0041 | Win rate 0.700
Episode 00069 | Loss 0.0030 | Win rate 0.700
Episode 00079 | Loss 0.0003 | Win rate 0.900
Episode 00089 | Loss 0.0002 | Win rate 0.600
Episode 00099 | Loss 0.0011 | Win rate 0.700


## Visualization

In [76]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import animation, rc
from IPython.display import HTML
frames = []

for e in range(100):
    loss = 0.
    env.reset()
    done = False
    state = env.observe()
    frames.append(state.reshape(grid_size,grid_size))
    while not done:
        q = agent.model.predict(state[np.newaxis])  # q table at current state
        action = np.argmax(q[0])
        next_state, reward, done = env.step(action)
        frames.append(next_state.reshape(grid_size,grid_size))
        state = next_state
# plt.imshow(frames[9],interpolation='none', cmap='gray')

In [None]:
#animation
fig, ax = plt.subplots()
im  = ax.imshow(np.random.random((grid_size,)*2),interpolation='none', cmap='gray')
def init():
#     im.set_array(np.random.random((grid_size,grid_size)))
    return (im,)
def animate(i):
    im.set_array(frames[i])
    return (im,)
anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(frames), interval=50, blit=True)
HTML(anim.to_html5_video())

In [None]:
#save to files
for i in range(len(frames)):
    plt.imshow(frames[i],interpolation='none', cmap='gray')
    plt.savefig("%03d.png" % i)

# [CartPole](https://gym.openai.com/envs/CartPole-v0/) MLP on 4-tuples

https://gym.openai.com/evaluations/eval_EIcM1ZBnQW2LBaFN6FY65g/

In [81]:
import random
import gym
import math
import numpy as np
from collections import deque
import tensorflow as tf

class DQN(object): # model + memory
    def __init__(self, env, # gym's env
                 gamma=.99, # gamma is reward decay in computing G_t=R_t+\gamma*R_{t+1}+...
                 max_memory=1000):
        self.memory = deque(maxlen=max_memory)
        self.env = env
        self.gamma = gamma

        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.Dense(24, input_shape=self.env.observation_space.shape, activation='relu'))
        self.model.add(tf.keras.layers.Dense(24, activation='relu'))
        self.model.add(tf.keras.layers.Dense(self.env.action_space.n, activation='linear'))
        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(.001)) #optimizer has its parameters
    
    def get_epsilon(self, episode):
        ###################### EPSILON to use for epsilon-greedy (probability of exploration)
#         return 1/(1+e*.1)  
        return max(.01, 0.995**episode)
        #####################################################################################

    def choose_action(self, state, epsilon):
        if np.random.random() <= epsilon:
            return self.env.action_space.sample() 
        else:
            return np.argmax(self.model.predict(state[np.newaxis])[0])
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay_train(self, batch_size=50):
        x_batch, y_batch = [], []
        minibatch = random.sample( self.memory, min(len(self.memory), batch_size))
        for state, action, reward, next_state, done in minibatch:
            y_target = self.model.predict(state[np.newaxis])[0]
            y_target[action] = reward if done else reward + self.gamma * np.max(self.model.predict(next_state[np.newaxis])[0])
            x_batch.append(state)
            y_batch.append(y_target)
        
        self.model.fit(np.array(x_batch), np.array(y_batch), batch_size=len(x_batch), verbose=0)

    def run(self,episodes=2000,explore=True):
        scores = deque(maxlen=episodes//10) #only store most recent 100

        for e in range(episodes):
            EPSILON = self.get_epsilon(e)*explore
            state = self.env.reset()
            done = False
            R = 0  #total return at the end of episode
            while not done:
                action = self.choose_action(state, EPSILON)
                next_state, reward, done, info = self.env.step(action)
                R += reward
                next_state = next_state
                self.remember(state, action, reward, next_state, done)
                state = next_state
            scores.append(R)
            
            if (e+1)%(episodes/10) == 0:
                print(f'[Episode {e}] - Mean survival time over last 100 episodes was {np.mean(scores)}')

            self.replay_train()

env = gym.make('CartPole-v1')
agent = DQN(env)
agent.run()

[Episode 199] - Mean survival time over last 100 episodes was 15.775
[Episode 399] - Mean survival time over last 100 episodes was 14.82
[Episode 599] - Mean survival time over last 100 episodes was 16.485
[Episode 799] - Mean survival time over last 100 episodes was 26.53
[Episode 999] - Mean survival time over last 100 episodes was 71.77
[Episode 1199] - Mean survival time over last 100 episodes was 36.44
[Episode 1399] - Mean survival time over last 100 episodes was 48.74
[Episode 1599] - Mean survival time over last 100 episodes was 46.26
[Episode 1799] - Mean survival time over last 100 episodes was 81.4
[Episode 1999] - Mean survival time over last 100 episodes was 95.72
