In [1]:
import numpy as np
import tensorflow as tf
import random
tf.__version__

'2.0.0-beta1'

# [Catch](https://gist.github.com/EderSantana/c7222daa328f0e885093) MLP on raw pixels

network input: `state (raw pixels)` output: `Q(a|state)`

## Setup Environment

In [2]:
class Catch(object):
    """Catch game on a square grid; one episode is one fruit dropped from top to bottom.

    The internal state is ``[fruit_row, fruit_col, basket_col]``. The fruit falls
    one row per step while the agent moves the basket along the bottom row.
    The episode ends when the fruit reaches the bottom row; the reward is then
    +1 if the fruit is within ``basketSize`` columns of the basket centre and
    -1 otherwise. Every earlier step yields a reward of 0.
    """

    def __init__(self, grid_size=10):
        self.grid_size = grid_size
        self.basketSize = 0  # half-width: the basket covers 2*basketSize+1 cells
        self.reset()

    def reset(self):
        """Start a new episode and return the initial flattened observation."""
        # NOTE: np.random.randint excludes its upper bound, so the fruit never
        # starts in the last column and the basket starts in [1, grid_size-3].
        n = np.random.randint(0, self.grid_size-1)  # starting fruit_col
        m = np.random.randint(1, self.grid_size-2)  # starting basket col
        self.state = np.asarray([0, n, m])          # [fruit_row, fruit_col, basket]
        return self.observe()

    def _get_reward(self):
        """Return +1/-1 once the fruit is on the bottom row (caught/missed), else 0."""
        fruit_row, fruit_col, basket = self.state
        if fruit_row == self.grid_size-1:
            return 1 if abs(fruit_col - basket) <= self.basketSize else -1
        else:
            return 0

    def _is_over(self):
        """Return True once the fruit has reached the bottom row."""
        return (self.state[0] == self.grid_size-1)

    def observe(self):
        """Render the state as a flattened ``grid_size*grid_size`` array of 0/1 pixels."""
        canvas = np.zeros((self.grid_size, self.grid_size))
        canvas[self.state[0], self.state[1]] = 1                                         # draw fruit
        canvas[-1, self.state[2]-self.basketSize:self.state[2] + self.basketSize+1] = 1  # draw basket
        return canvas.flatten()

    def step(self, action):
        """Apply one action and advance the fruit by one row.

        Args:
            action: 0 moves the basket left, 1 keeps it in place, any other
                value moves it right.

        Returns:
            Tuple ``(observation, reward, done)``.
        """
        if action == 0:
            move = -1
        elif action == 1:
            move = 0
        else:
            move = 1

        fruit_row, fruit_col, basket = self.state
        # Keep the whole basket on the grid. The last valid centre column is
        # grid_size-1-basketSize; the previous bound (grid_size-basketSize) let
        # the basket step off the right edge, where it was drawn as nothing.
        leftmost = self.basketSize
        rightmost = self.grid_size - 1 - self.basketSize
        new_basket = min(max(leftmost, basket + move), rightmost)

        self.state = np.asarray([fruit_row + 1, fruit_col, new_basket])  # fruit drops one row

        return self.observe(), self._get_reward(), self._is_over() # returns whole canvas, R, done?

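* Store all intermediate transitions in a replay memory
* After each episode, train on a random batch sampled from memory
* The training targets are the Q-values predicted by the current model (`model.predict(state)`), with the entry for the action taken replaced by `reward + gamma * max Q(next_state)`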

## Deep Model

In [36]:
from collections import deque
class DQN(object):  #model+memory
    """Deep Q-learning agent for the Catch environment (online net, target net, replay memory).

    The online network maps a flattened grid observation to three Q-values,
    one per action ``[move_left, stay, move_right]``. Transitions are stored in
    a bounded replay memory and the online network is fitted on random
    minibatches whose bootstrap values come from the target network.
    """

    def __init__(self,catchenv,
                 gamma=.9, # gamma is reward decay in computing G_t=R_t+\gamma*R_{t+1}+...
                 max_memory=500):
        """Build the networks and an empty replay memory.

        Args:
            catchenv: environment exposing ``grid_size``, ``reset()`` and
                ``step(action) -> (observation, reward, done)``.
            gamma: discount factor applied to the bootstrapped next-state value.
            max_memory: maximum number of transitions kept; oldest are dropped first.
        """
        self.memory = deque(maxlen=max_memory)
        self.env = catchenv
        self.gamma = gamma

        self.model = tf.keras.models.Sequential([
                         tf.keras.layers.Dense(100, input_shape=(self.env.grid_size**2,), activation='relu'),
                         tf.keras.layers.Dense(100, activation='relu'),
                         tf.keras.layers.Dense(3),  # [move_left, stay, move_right]
                        ])
        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())     # learning rate handled by optimizer
        self.target_model = tf.keras.models.clone_model(self.model)
        # clone_model creates freshly initialised weights; copy the online
        # weights so the very first replay does not bootstrap from an
        # unrelated random network.
        self.target_model.set_weights(self.model.get_weights())

    def get_epsilon(self, episode):
        """Return the exploration probability for ``episode`` (currently a constant 0.1).

        Alternative schedules that could be returned instead:
        ``1/(1+episode*.2)`` or ``max(.01, 0.995**episode)``.
        """
        return 0.1

    def choose_action(self, state, epsilon):
        """Pick an action epsilon-greedily and return it as a plain ``int`` in {0, 1, 2}."""
        if np.random.random() <= epsilon:
            # Scalar draw, so both branches return the same kind of value
            # (the previous size=1 draw produced a shape-(1,) array).
            return int(np.random.randint(0, 3))
        return int(np.argmax(self.model.predict(state[np.newaxis])[0]))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay_train(self, batch_size=50):
        """Fit the online network on one random minibatch and return the loss.

        For every sampled transition the target equals the current prediction,
        except for the executed action, whose entry becomes
        ``reward + gamma * max_a' Q_target(next_state, a')`` (just ``reward``
        when the transition ended the episode).

        Returns:
            The value returned by ``train_on_batch``, or ``nan`` when the
            memory is empty and nothing could be trained.
        """
        if not self.memory:
            return np.nan

        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        states = np.array([t[0] for t in minibatch])        # (batch, n_pixels)
        next_states = np.array([t[3] for t in minibatch])
        targets = self.model.predict(states)                # current estimate of Q(a|s), (batch, num_actions)
        next_q = self.target_model.predict(next_states)     # Q(a|s') from the target network

        for i, (_, action, reward, _, done) in enumerate(minibatch):
            # Only the executed action receives a new target; the other
            # entries keep the current prediction and add no error.
            targets[i, action] = reward + self.gamma*(0 if done else np.max(next_q[i]))

        return self.model.train_on_batch(states, targets) #returns current loss

    def run(self,episdoes=1000,eval_mode=False):
        """Play ``episdoes`` episodes, training after each one unless ``eval_mode`` is set.

        Progress is printed roughly ten times per run; the reported win rate
        is averaged over the episodes since the previous report.

        Args:
            episdoes: number of episodes to play.
            eval_mode: when True, act greedily (epsilon = 0) and skip training.
        """
        # Integer reporting interval; max(1, ...) keeps short runs (< 10
        # episodes) from creating a zero-length score window.
        report_every = max(1, episdoes // 10)
        scores = deque(maxlen=report_every) #store new episodes after previous print

        for e in range(1,episdoes+1):
            EPSILON = 0 if eval_mode else self.get_epsilon(e)
            state = self.env.reset()
            done = False
            R = 0  #total return at the end of episode
            while not done:
                action = self.choose_action(state, EPSILON)
                next_state, reward, done = self.env.step(action)
                R += reward
                self.remember(state, action, reward, next_state, done)
                state = next_state
            loss = np.nan if eval_mode else self.replay_train()
            scores.append((R+1)/2)  # maps a return of -1/+1 to 0/1 (loss/win)

            if e % report_every == 0:
                print(f'Episode {e:05d} | Loss {loss:.4f} | Win rate {np.mean(scores):.3f}')
            # The target network is synchronised after every episode.
            self.target_model.set_weights(self.model.get_weights())

# Train the Catch agent on a 10x10 grid for 1000 episodes.
env = Catch(grid_size=10)
agent = DQN(env)
agent.run(episdoes=1000)

Episode 00100 | Loss 0.0081 | Win rate 0.120
Episode 00200 | Loss 0.0054 | Win rate 0.190
Episode 00300 | Loss 0.0092 | Win rate 0.210
Episode 00400 | Loss 0.0051 | Win rate 0.230
Episode 00500 | Loss 0.0123 | Win rate 0.380
Episode 00600 | Loss 0.0055 | Win rate 0.370
Episode 00700 | Loss 0.0080 | Win rate 0.620
Episode 00800 | Loss 0.0048 | Win rate 0.580
Episode 00900 | Loss 0.0020 | Win rate 0.640
Episode 01000 | Loss 0.0017 | Win rate 0.720


## Evaluation (epsilon=0)

In [35]:
# Greedy evaluation (epsilon = 0, no training) over 100 episodes.
agent.run(100, eval_mode=True)

Episode 00010 | Loss nan | Win rate 0.700
Episode 00020 | Loss nan | Win rate 0.600
Episode 00030 | Loss nan | Win rate 0.800
Episode 00040 | Loss nan | Win rate 0.700
Episode 00050 | Loss nan | Win rate 0.600
Episode 00060 | Loss nan | Win rate 0.900
Episode 00070 | Loss nan | Win rate 0.800
Episode 00080 | Loss nan | Win rate 0.900
Episode 00090 | Loss nan | Win rate 0.500
Episode 00100 | Loss nan | Win rate 0.800


## Visualization

In [76]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import animation, rc
from IPython.display import HTML
# Roll out 100 greedy Catch episodes and record every observation as a 2-D frame.
# NOTE: `env` and `agent` must still be the Catch environment and its DQN here;
# both names are rebound to CartPole objects further down in this notebook.
frames = []
grid_size = env.grid_size  # previously undefined at notebook level (NameError on a fresh kernel)

for _ in range(100):
    state = env.reset()  # reset() already returns the initial observation
    done = False
    frames.append(state.reshape(grid_size, grid_size))
    while not done:
        q = agent.model.predict(state[np.newaxis])  # Q-values for the current state
        action = np.argmax(q[0])
        state, _, done = env.step(action)
        frames.append(state.reshape(grid_size, grid_size))

In [None]:
# Animation: replay the recorded frames as an inline HTML5 video.
fig, ax = plt.subplots()
# Seed the image artist with the first recorded frame and pin the colour range
# to [0, 1]. This removes the dependency on a notebook-level `grid_size` and on
# a random-noise image to set the colour scale.
im = ax.imshow(frames[0], interpolation='none', cmap='gray', vmin=0, vmax=1)


def init():
    """Return the artists to redraw when blitting starts."""
    return (im,)


def animate(i):
    """Show frame ``i`` and return the updated artist."""
    im.set_array(frames[i])
    return (im,)


anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(frames), interval=50, blit=True)
video = anim.to_html5_video()
plt.close(fig)  # otherwise the static figure is rendered in addition to the video
HTML(video)

In [None]:
# Save every frame as a PNG named by its index (000.png, 001.png, ...).
# One figure and one image artist are reused for all frames. Calling
# plt.imshow once per frame stacked a new image on the same axes every
# iteration, so each save got slower and memory kept growing.
fig, ax = plt.subplots()
im = None
for i, frame in enumerate(frames):
    if im is None:
        im = ax.imshow(frame, interpolation='none', cmap='gray', vmin=0, vmax=1)
    else:
        im.set_data(frame)
    fig.savefig("%03d.png" % i)
plt.close(fig)

# [CartPole](https://gym.openai.com/envs/CartPole-v0/)

##  MLP(value+target) on 4-tuples

https://gym.openai.com/evaluations/eval_EIcM1ZBnQW2LBaFN6FY65g/, https://gym.openai.com/evaluations/eval_OeUSZwUcR2qSAqMmOE1UIw/

In [39]:
import random
import gym
import math
import numpy as np
from collections import deque
import tensorflow as tf

class DQN(object): # model + memory
    """Deep Q-learning agent for a gym environment (online net, target net, replay memory).

    The online network maps an observation to one Q-value per discrete action.
    Bootstrap values for the training targets come from a separate target
    network that is synchronised with the online network after every episode.

    The code uses the classic gym interface: ``env.reset()`` returns the
    observation and ``env.step(action)`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self, env, # gym's env
                 gamma=.99, # gamma is reward decay in computing G_t=R_t+\gamma*R_{t+1}+...
                 max_memory=1000):
        """Build the networks and an empty replay memory.

        Args:
            env: gym environment with a discrete action space.
            gamma: discount factor applied to the bootstrapped next-state value.
            max_memory: maximum number of transitions kept; oldest are dropped first.
        """
        self.memory = deque(maxlen=max_memory)
        self.env = env
        self.gamma = gamma

        self.model = tf.keras.models.Sequential()
        self.model.add(tf.keras.layers.Dense(16, input_shape=self.env.observation_space.shape, activation='relu'))
        self.model.add(tf.keras.layers.Dense(16, activation='relu'))
        self.model.add(tf.keras.layers.Dense(self.env.action_space.n, activation='linear'))
        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(.001)) #optimizer has its parameters

        self.target_model = tf.keras.models.clone_model(self.model)
        # clone_model creates freshly initialised weights; start the target
        # network as an exact copy of the online network.
        self.target_model.set_weights(self.model.get_weights())

    def get_epsilon(self, episode):
        """Return the exploration probability for ``episode`` (currently a constant 0.1).

        Alternative schedules that could be returned instead:
        ``1/(1+episode*.1)`` or ``max(.01, 0.995**episode)``.
        """
        return 0.1

    def choose_action(self, state, epsilon):
        """Pick an action epsilon-greedily: random with probability ``epsilon``, else argmax Q."""
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state[np.newaxis])[0])

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay_train(self, batch_size=32):
        """Fit the online network on one random minibatch and return the loss.

        For every sampled transition the target equals the current prediction,
        except for the executed action, whose entry becomes
        ``reward + gamma * max_a' Q_target(next_state, a')`` (just ``reward``
        when the transition ended the episode).

        Returns:
            The value returned by ``train_on_batch``, or ``nan`` when the
            memory is empty and nothing could be trained.
        """
        if not self.memory:
            return np.nan

        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        states = np.array([t[0] for t in minibatch])        # (batch, observation_dim)
        next_states = np.array([t[3] for t in minibatch])
        targets = self.model.predict(states)                # current estimate of Q(a|s), (batch, num_actions)
        # Bootstrap from the target network. The online network was used here
        # before, which left target_model synchronised but never consulted.
        next_q = self.target_model.predict(next_states)

        for i, (_, action, reward, _, done) in enumerate(minibatch):
            # Only the executed action receives a new target; the other
            # entries keep the current prediction and add no error.
            targets[i, action] = reward + (0 if done else self.gamma*np.max(next_q[i]))

        return self.model.train_on_batch(states, targets) #returns current loss

    def run(self,episodes=2000,eval_mode=False):
        """Play ``episodes`` episodes, training after every step unless ``eval_mode`` is set.

        Progress is printed roughly ten times per run; the statistics cover
        the episodes since the previous report.

        Args:
            episodes: number of episodes to play.
            eval_mode: when True, act greedily (epsilon = 0) and skip training.
        """
        # Integer reporting interval; max(1, ...) keeps short runs (< 10
        # episodes) from creating a zero-length score window.
        report_every = max(1, episodes // 10)
        scores = deque(maxlen=report_every)

        for e in range(1,episodes+1):
            EPSILON = 0 if eval_mode else self.get_epsilon(e)
            state = self.env.reset()
            done = False
            TotalR = 0
            while not done:
                action = self.choose_action(state, EPSILON)
                next_state, reward, done, info = self.env.step(action)
                TotalR += reward
#                 if done: reward=-200  # important(?)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                if not eval_mode: self.replay_train()   # train once every step better than once every episode
            scores.append(TotalR)

            if e % report_every == 0:
                print(f'Episode {e:05d}: Mean Time {np.mean(scores):.3f}, Max Time {np.max(scores):.0f}')

            # The target network is synchronised after every episode.
            self.target_model.set_weights(self.model.get_weights())

# Train the CartPole agent for 500 episodes.
env = gym.make('CartPole-v0')
agent = DQN(env)
agent.run(episodes=500)

Episode 00050: Mean Time 10.380, Max Time 13
Episode 00100: Mean Time 50.940, Max Time 111
Episode 00150: Mean Time 61.680, Max Time 200
Episode 00200: Mean Time 196.580, Max Time 200
Episode 00250: Mean Time 151.740, Max Time 200
Episode 00300: Mean Time 138.980, Max Time 200
Episode 00350: Mean Time 192.720, Max Time 200
Episode 00400: Mean Time 197.160, Max Time 200
Episode 00450: Mean Time 193.620, Max Time 200
Episode 00500: Mean Time 136.940, Max Time 200


In [40]:
# Greedy evaluation (epsilon = 0, no training) over 100 episodes.
agent.run(100, eval_mode=True)

Episode 00010: Mean Time 200.000, Max Time 200
Episode 00020: Mean Time 200.000, Max Time 200
Episode 00030: Mean Time 200.000, Max Time 200
Episode 00040: Mean Time 200.000, Max Time 200
Episode 00050: Mean Time 199.600, Max Time 200
Episode 00060: Mean Time 200.000, Max Time 200
Episode 00070: Mean Time 200.000, Max Time 200
Episode 00080: Mean Time 200.000, Max Time 200
Episode 00090: Mean Time 199.900, Max Time 200
Episode 00100: Mean Time 200.000, Max Time 200


## CNN on raw pixels

https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

network input: `state (raw pixels)` output: `Q(a|state)`

In [43]:
import torch
import torch.nn as nn
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv + batch-norm stages followed by a linear head.

    Args:
        h: height of the input image in pixels.
        w: width of the input image in pixels.
        outputs: number of values produced per input (one per action).
    """

    def __init__(self, h, w, outputs):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 5, stride = 2):
            """Output length of an unpadded convolution along one dimension."""
            return (size - (kernel_size - 1) - 1) // stride  + 1
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        """Map a batch of 3-channel images ``(N, 3, h, w)`` to ``(N, outputs)`` values."""
        # nn.functional is reached through the already-imported `nn`; the
        # alias `F` that was used here is not imported in the visible cells.
        x = nn.functional.relu(self.bn1(self.conv1(x)))
        x = nn.functional.relu(self.bn2(self.conv2(x)))
        x = nn.functional.relu(self.bn3(self.conv3(x)))
        return self.head(x.view(x.size(0), -1))


In [44]:
env = gym.make('CartPole-v0')