In [65]:
import numpy as np
import random
import gym
from collections import deque # for finite memory 
import tensorflow as tf
tf.__version__

'2.0.0-beta1'

# Implementations
* `env` must support the methods `.reset()`, `.step()` with returns like `gym`'s
* if `env` is not `gym`'s environment, must provide `state_dim` and `action_dim`

## Deep Value Network, Q learning
* network input: `state (raw pixels)` output: `Q(a|state)`
* The targets are built from the target network's Q-values, and only for *the state/action actually followed*. The other actions are not used for training: their targets are simply set to what the model currently predicts, so their loss is zero.
* the loss `mse` is chosen such that the negative gradient of the loss is (up to a constant factor, and treating the TD target as a constant) the update we want on the parameters: $-\tfrac{1}{2}\nabla_{\bf w}[r+\gamma\max_{a'}Q(s',a')-\hat{Q}_{\bf w}(s,a)]^2=[r+\gamma\max_{a'}Q(s',a')-\hat{Q}_{\bf w}(s,a)]\nabla_{\bf w}\hat{Q}_{\bf w}(s,a)\rightarrow\Delta {\bf w}$

[Three improvements](https://www.youtube.com/watch?v=EX1CIVVkWdE):
1. **Experience Replay**: If we only use most recent states, the training samples are highly correlated (all coming from the same episodes). Store all intermediate states; after each step, train a random batch from memory.
2. **Target Network**: Training struggles to converge if the target is constantly shifting. The TD target is therefore computed by a second network, which is only synchronised with the most up-to-date value network once in a while.
3. **Huber Loss**: Avoid exploding gradients by using Huber loss instead of `mse`. (The `DQN` class below still compiles its network with `mse`.)

In [66]:
class DQN(object):  #model+memory
    """Deep Q-learning agent: online Q-network, target network and replay memory.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` and ``step(action)`` with gym-style returns.
    state_dim, action_dim : int, optional
        Required when ``env`` has no ``observation_space`` / ``action_space``.
    gamma : float
        Discount factor used in the TD target.
    max_memory : int
        Maximum number of transitions kept in the replay memory.
    """

    def __init__(self,env,state_dim=None, action_dim=None,
                 gamma=.9,max_memory=5000):
        self.memory = deque(maxlen=max_memory)
        self.env = env
        self.gamma = gamma
        self.action_dim = action_dim or self.env.action_space.n
        self.state_shape = (state_dim,) if state_dim else self.env.observation_space.shape

        self.model = tf.keras.models.Sequential([
                         tf.keras.layers.Dense(96, input_shape=self.state_shape, activation='relu'),
                         tf.keras.layers.Dense(48, activation='relu'),
                         tf.keras.layers.Dense(self.action_dim),
                        ])
        self.model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam())     # learning rate handled by optimizer
        # clone_model only copies the architecture (weights are freshly initialised),
        # so the target network must be synchronised explicitly before first use.
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def get_epsilon(self, episode):
        """Return the exploration probability for epsilon-greedy at ``episode``.

        Currently a constant; decaying alternatives that were tried:
        ``1/(1+episode*.2)`` and ``max(.01, 0.995**episode)``.
        """
        return 0.1

    def choose_action(self, state, epsilon):
        """Epsilon-greedy action: random with probability ``epsilon``, else argmax of Q."""
        if np.random.random() <= epsilon:
            return np.random.randint(0, self.action_dim)
        else:
            return np.argmax(self.model.predict(state[np.newaxis])[0])

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay_train(self, batch_size=50):
        """Train the online network on one random batch drawn from memory.

        Returns the value of ``train_on_batch``, or ``None`` when memory is empty.
        """
        if not self.memory: return   # nothing to sample yet
        batch = random.sample( self.memory, min(len(self.memory), batch_size))
        states,actions,rewards,nextstates,dones = map(np.array,zip(*batch))
        ys=self.model.predict(states)             # current estimation of the Q(a|s)
        qs=self.target_model.predict(nextstates)  # Q(a|s') for next state, from the target network

        # Target is R_t + gamma * max_a' Q(s',a') ONLY for the action executed (no bootstrap
        # when the episode ended); the other actions keep the current prediction as target,
        # so they contribute zero loss.
        ys[np.arange(ys.shape[0]),actions] = rewards + (1-dones)*self.gamma*np.max(qs,axis=1)
        return self.model.train_on_batch(states, ys)

    def run(self,episodes=1000,show_every=None,fit_data='per_step',update_target_every=2):
        """Play ``episodes`` episodes, optionally training along the way.

        fit_data: ``'per_step'`` / ``'per_episode'`` select when ``replay_train`` is
        called; any falsy value disables training and sets epsilon to 0 (evaluation).
        show_every: print statistics every this many episodes (default ``episodes//10``,
        at least 1). update_target_every: episodes between target-network syncs.
        """
        show_every = show_every or max(1, episodes//10)   # never 0: it is used as a modulus below
        scores = deque(maxlen=show_every) #store new episodes after previous print

        for e in range(1,episodes+1):
            EPSILON = 0 if not fit_data else self.get_epsilon(e)
            state, done = self.env.reset(), False
            R = 0
            while not done:
                action = self.choose_action(state, EPSILON)
                next_state, reward, done,_ = self.env.step(action)
                R += reward
                self.remember(state, action, reward, next_state, done)
                state = next_state
                if fit_data=='per_step': self.replay_train()
            if fit_data=='per_episode': self.replay_train()
            scores+=R,

            if e%show_every == 0:
                print(f'Episode {e:4d} | Average R {np.mean(scores):6.4g} | Median R {np.median(scores)}')
            if e%update_target_every==0:  #update frequency of target network
                self.target_model.set_weights(self.model.get_weights())

## Deep Policy Network, MC
* no epsilon-greedy necessary, since the network outputs a softmax (action probabilities) and actions are sampled from it, so the policy is stochastic rather than deterministic
* loss `categorical_crossentropy` is chosen such that gradient of loss is the update we want on the network parameters $\theta$:

  Loss $L(v, \hat{p}) = -\sum_av_a \log \hat{p}_a$, where the targets used are $v_a$=value or advantage following action $a$ that was actually taken (zeros for other non-taken actions), and $\hat{p}_a$ is output from policy network for the probability of choosing action $a$. So $L=-v_t\log\pi_\theta(s,a)$, and $-\nabla_\theta L=v_t\nabla_\theta\log\pi_\theta(s,a)\rightarrow\Delta\theta$, exactly what wanted.
* use the mean return of the whole episode as the baseline -- a constant value
* calculate the correct targets at the end of an episode, then add to memory for replay
* cannot use TD as there is no estimation of value function available
* train batch size should be ~ episode length? train once per episode or per step?
* open question: is this a poor fit for games that only give a reward at the very end?

In [73]:
class PolicyNet(object):  #model+memory
    """Monte-Carlo policy-gradient agent with a softmax policy network and replay memory.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` and ``step(action)`` with gym-style returns.
    state_dim, action_dim : int, optional
        Required when ``env`` has no ``observation_space`` / ``action_space``.
    gamma : float
        Discount factor used to turn rewards into returns.
    max_memory : int or None
        Maximum number of ``(state, target)`` pairs kept; ``None`` means unbounded.
    """

    def __init__(self,env,state_dim=None, action_dim=None,
                 gamma=.9,max_memory=None):
        self.memory = deque(maxlen=max_memory)
        self.env = env
        self.gamma = gamma
        self.action_dim = action_dim or self.env.action_space.n
        self.state_shape = (state_dim,) if state_dim else self.env.observation_space.shape

        self.model = tf.keras.models.Sequential([
                         tf.keras.layers.Dense(24, input_shape=self.state_shape, activation='relu'),
                         tf.keras.layers.Dense(24, activation='relu'),
                         tf.keras.layers.Dense(self.action_dim, activation='softmax'),
                        ])
        self.model.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.RMSprop())

    def choose_action(self, state):
        """Sample an action from the policy network's output distribution."""
        policy = self.model.predict(state[np.newaxis])[0]
        return np.random.choice(self.action_dim, p=policy)

    def remember(self,mem):
        """Convert one finished episode into training pairs and store them.

        ``mem`` is a list of ``(state, action, reward)`` tuples in time order.
        Rewards are turned into discounted returns, standardised over the episode
        (zero mean, unit std), and written into the target column of the action taken.
        """
        if not mem: return   # empty episode: nothing to store
        states,actions,rewards = zip(*mem)

        returns = np.asarray(rewards,dtype=np.float32) #convert rewards to returns
        G = 0
        for t in reversed(range(len(returns))):
            G = G * self.gamma + returns[t]
            returns[t] = G
        returns -= np.mean(returns)          # episode-mean baseline
        returns /= np.std(returns)+1e-12     # epsilon avoids division by zero

        # Target is the standardised return for the action taken, zero for the others.
        Y = np.zeros((len(states), self.action_dim))
        Y[np.arange(len(states)), np.asarray(actions)] = returns

        self.memory.extend(zip(states,Y))

    def replay_train(self, batch_size=64):
        """Train on one random batch from memory; returns ``None`` when memory is empty."""
        if not self.memory: return
        batch = random.sample( self.memory, min(len(self.memory), batch_size))
        X,Y = map(np.asarray,zip(*batch))
        return self.model.train_on_batch(X, Y)

    def run(self,episodes=1000,show_every=None,fit_data='per_step'):
        """Play ``episodes`` episodes, optionally training along the way.

        fit_data: ``'per_step'`` / ``'per_episode'`` select when ``replay_train`` is
        called; any other value disables training. show_every: print statistics every
        this many episodes (default ``episodes//10``, at least 1).
        """
        show_every = show_every or max(1, episodes//10)   # never 0: it is used as a modulus below
        scores = deque(maxlen=show_every)

        for e in range(1,episodes+1):
            state,done = self.env.reset(),False
            short_term_mem = []
            R = 0
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                R += reward
                short_term_mem += (state, action, reward),
                state = next_state
                if done: self.remember(short_term_mem)   # returns are only known once the episode ends
                if fit_data=='per_step': self.replay_train()
            if fit_data=='per_episode': self.replay_train()
            scores+=R,

            if e%show_every == 0:
                print(f'Episode {e:4d} | Average R {np.mean(scores):6.4g} | Median R {np.median(scores)}')

## Deep policy+value (Actor-Critic) networks
* Policy network predicts probabilities of actions given state, $\pi_\theta(a|s)$. Value network estimates *state*-value function  for the given state $V_v(s)$
* Policy network loss is $L= -\sum_a[r+\gamma V_v(s_{t+1})-V_v(s_t)]\log\pi_\theta(a|s)$, so $-\nabla_\theta L=[r+\gamma V_v(s_{t+1})-V_v(s_t)]\nabla_\theta\log\pi_\theta(s,a)\rightarrow\Delta\theta$
* Value network loss is $L= [r+\gamma V_v(s_{t+1})-V_v(s_t)]^2$, so $-\nabla_v L =[r+\gamma V_v(s_{t+1})-V_v(s_t)]\nabla_v V_v(s_t)\rightarrow\Delta {\bf v}$
* update by TD(0) now as there is a network to give estimation of V(s)

In [93]:
class ActorCritic(object):  #models+memory
    """Actor-critic agent: softmax policy network, state-value network and replay memory.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` and ``step(action)`` with gym-style returns.
    state_dim, action_dim : int, optional
        Required when ``env`` has no ``observation_space`` / ``action_space``.
    gamma : float
        Discount factor used in the TD(0) target.
    max_memory : int
        Maximum number of transitions kept in the replay memory.
    """

    def __init__(self,env,state_dim=None, action_dim=None,
                 gamma=.9,
                 max_memory=5000):
        self.memory = deque(maxlen=max_memory)
        self.env = env
        self.gamma = gamma
        self.action_dim = action_dim or self.env.action_space.n
        self.state_shape = (state_dim,) if state_dim else self.env.observation_space.shape

        self.actor = tf.keras.models.Sequential([
                         tf.keras.layers.Dense(48, input_shape=self.state_shape, activation='relu',kernel_initializer='he_uniform'),
                         tf.keras.layers.Dense(self.action_dim, activation='softmax',kernel_initializer='he_uniform'),
                     ])
        # `learning_rate` is the TF2 keyword; `lr` is only a deprecated alias.
        self.actor.compile(loss="categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

        self.critic = tf.keras.models.Sequential([
                         tf.keras.layers.Dense(48, input_shape=self.state_shape, activation='relu',kernel_initializer='he_uniform'),
                         tf.keras.layers.Dense(1, kernel_initializer='he_uniform'),  #value of the state
                     ])
        self.critic.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=0.005))

    def choose_action(self, state):
        """Sample an action from the actor's output distribution."""
        policy = self.actor.predict(state[np.newaxis])[0]
        return np.random.choice(self.action_dim, p=policy)

    def remember(self,state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay_train(self, batch_size=32):
        """Train actor and critic on one random batch from memory.

        Returns ``(actor_loss, critic_loss)`` as given by ``train_on_batch``,
        or ``None`` when memory is empty.
        """
        if not self.memory: return
        batch_size = min(len(self.memory), batch_size)
        batch = random.sample( self.memory, batch_size)
        states,actions,rewards,next_states,dones = map(np.array,zip(*batch))

        value_target = np.zeros((batch_size, 1))
        advantages   = np.zeros((batch_size, self.action_dim))

        # reshape(-1) keeps a 1-D vector even for a batch of one (squeeze would give a 0-d array)
        curr_values = self.critic.predict(states).reshape(-1)
        next_values = self.critic.predict(next_states).reshape(-1)

        # TD(0) target r + gamma*V(s'), with no bootstrap on terminal transitions
        td_target = rewards + (1-dones)*self.gamma*next_values
        advantages[np.arange(batch_size),actions] = td_target - curr_values   # only for the action taken
        value_target[:,0] = td_target

        return self.actor.train_on_batch(states,advantages),\
               self.critic.train_on_batch(states,value_target)

    def run(self,episodes=1000,show_every=None,fit_data='per_step'):
        """Play ``episodes`` episodes, optionally training along the way.

        fit_data: ``'per_step'`` / ``'per_episode'`` select when ``replay_train(128)`` is
        called; any other value disables training. show_every: print statistics every
        this many episodes (default ``episodes//10``, at least 1).
        """
        show_every = show_every or max(1, episodes//10)   # never 0: it is used as a modulus below
        scores = deque(maxlen=show_every)

        for e in range(1,episodes+1):
            state,done = self.env.reset(),False
            R = 0
            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                R += reward
                self.remember(state, action, reward, next_state, done)
                state = next_state
                if fit_data=='per_step': self.replay_train(128)
            if fit_data=='per_episode': self.replay_train(128)
            scores+=R,

            if e%show_every == 0:
                print(f'Episode {e:4d} | Average R {np.mean(scores):6.4g} | Median R {np.median(scores)}')

# [Catch](https://gist.github.com/EderSantana/c7222daa328f0e885093) using raw pixels

## Setup Environment

In [69]:
class Catch(object): # 1 game is 1 fruit dropped from top to bottom
    """Minimal Catch game on a ``grid_size`` x ``grid_size`` board.

    A fruit falls one row per step from the top row; the agent moves a basket
    along the bottom row. ``self.state`` is ``[fruit_row, fruit_col, basket_left]``.
    Actions: 0 = move left, 1 = stay, anything else = move right.
    """

    def __init__(self, grid_size=10):
        self.grid_size = grid_size
        self.basketSize = 1

    def reset(self):
        """Start a new game and return the flattened board."""
        # np.random.randint's upper bound is exclusive, so these bounds cover every
        # column for the fruit and every position where the basket fits on the board.
        n = np.random.randint(0, self.grid_size)                    # starting fruit_col
        m = np.random.randint(0, self.grid_size-self.basketSize+1)  # starting basket col
        self.state = np.asarray([0, n, m])                          # [fruit_row, fruit_col, basket's left end]
        return self.observe()

    def _get_reward(self):   # inc/dec score only if fruit has dropped to bottom
        """Return 1. if the fruit is on the bottom row inside the basket, else 0."""
        fruit_row, fruit_col, basket_left = self.state
        if fruit_row == self.grid_size-1 and basket_left <= fruit_col < basket_left+self.basketSize:
            return 1.
        else:
            return 0.

    def _is_over(self):    # game over if fruit dropped to bottom
        return (self.state[0] == self.grid_size-1)

    def observe(self):
        """Render the current state as a flattened 0/1 canvas."""
        fruit_row, fruit_col, basket_left = self.state
        canvas = np.zeros((self.grid_size, self.grid_size))
        canvas[fruit_row, fruit_col] = 1                           # draw fruit
        # draw exactly basketSize cells, matching the span _get_reward counts as a catch
        canvas[-1, basket_left:basket_left + self.basketSize] = 1  # draw basket
        return canvas.flatten()

    def step(self, action):
        """Move the basket, drop the fruit one row; return ``(canvas, reward, done, None)``."""
        if action == 0:   action = -1 # move left
        elif action == 1: action =  0 # stay
        else:             action =  1 # move right
        f0, f1, basket_left = self.state
        # clamp so the basket stays on the board
        new_basket_left = min(max(0, basket_left + action), self.grid_size-self.basketSize)
        f0 += 1                       # fruit dropped by one pixel
        self.state = np.asarray([f0, f1, new_basket_left])

        return self.observe(), self._get_reward(), self._is_over(), None # returns whole canvas, R, done?

`gym`'s environment

In [70]:
class Catch(gym.Env):   # 1 game is 1 fruit dropped from top to bottom. agent at bottom row to catch it
    """gym-style Catch game on a ``grid_size`` x ``grid_size`` board.

    A fruit falls one row per step from the top row; the agent moves a basket of
    ``basket_size`` cells along the bottom row. ``self.state`` is
    ``[fruit_row, fruit_col, basket_left]``.
    Actions: 0 = move left, 1 = stay, 2 = move right.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, grid_size=10,basket_size=1):
        super(Catch, self).__init__()

        self.action_space = gym.spaces.Discrete(3)
        self.observation_space = gym.spaces.MultiBinary(grid_size*grid_size)

        self.grid_size = grid_size
        self.basket_size = basket_size

    def reset(self):           # Reset the state of the environment to an initial state
        # np.random.randint's upper bound is exclusive, so these bounds cover every
        # column for the fruit and every position where the basket fits on the board.
        n = np.random.randint(0, self.grid_size)                     # starting fruit_col
        m = np.random.randint(0, self.grid_size-self.basket_size+1)  # starting basket col
        self.state = np.asarray([0, n, m])                           # [fruit_row, fruit_col, basket's left end]
        return self._observe()

    def step(self, action):    # Execute one time step within the environment
        """Move the basket, drop the fruit one row; return ``(canvas, reward, done, info)``."""
        if action == 0:   action = -1 # move left
        elif action == 1: action =  0 # stay
        else:             action =  1 # move right
        f0, f1, basket_left = self.state
        # clamp so the basket stays on the board
        new_basket_left = min(max(0, basket_left + action), self.grid_size-self.basket_size)
        f0 += 1                       # fruit dropped by one pixel
        self.state = np.asarray([f0, f1, new_basket_left])

        # info is an (empty) dict, as gym's step() convention expects
        return self._observe(), self._get_reward(), self._is_over(), {}

    def render(self, mode='human', close=False):        # Render the environment to the screen
        print(self.state)   # was `print(state)`, an undefined name
    #############################helper methods---not required by gym.env##################################
    def _get_reward(self):   # inc/dec score only if fruit has dropped to bottom
        """Return 1.0 if the fruit is on the bottom row inside the basket, else 0.0."""
        fruit_row, fruit_col, basket_left = self.state
        caught = fruit_row == self.grid_size-1 and basket_left <= fruit_col < basket_left+self.basket_size
        return float(caught)

    def _is_over(self):    # game over if fruit dropped to bottom
        return bool(self.state[0] == self.grid_size-1)

    def _observe(self):
        """Render the current state as a flattened 0/1 canvas."""
        fruit_row, fruit_col, basket_left = self.state
        canvas = np.zeros((self.grid_size, self.grid_size))
        canvas[fruit_row, fruit_col] = 1                            # draw fruit
        # draw exactly basket_size cells, matching the span _get_reward counts as a catch
        canvas[-1, basket_left:basket_left + self.basket_size] = 1  # draw basket
        return canvas.flatten()

## Various Models

In [71]:
# Train a DQN agent on 10x10 Catch, fitting one replay batch at the end of each episode.
env = Catch(grid_size=10)
agent = DQN(env)
agent.run(episodes=2000, fit_data='per_episode')

Episode  200 | Average R   0.21 | Median R 0.0
Episode  400 | Average R   0.33 | Median R 0.0
Episode  600 | Average R   0.37 | Median R 0.0
Episode  800 | Average R  0.465 | Median R 0.0
Episode 1000 | Average R  0.655 | Median R 1.0
Episode 1200 | Average R   0.79 | Median R 1.0
Episode 1400 | Average R   0.76 | Median R 1.0
Episode 1600 | Average R  0.815 | Median R 1.0
Episode 1800 | Average R  0.815 | Median R 1.0
Episode 2000 | Average R   0.85 | Median R 1.0


In [75]:
# Train the policy-gradient agent on 10x10 Catch, fitting one replay batch after every step.
env = Catch(grid_size=10)
agent = PolicyNet(env)
agent.run(episodes=1000, fit_data='per_step')

Episode  100 | Average R   0.07 | Median R 0.0
Episode  200 | Average R   0.15 | Median R 0.0
Episode  300 | Average R   0.04 | Median R 0.0
Episode  400 | Average R   0.04 | Median R 0.0
Episode  500 | Average R    0.1 | Median R 0.0
Episode  600 | Average R   0.07 | Median R 0.0
Episode  700 | Average R   0.12 | Median R 0.0
Episode  800 | Average R    0.1 | Median R 0.0
Episode  900 | Average R   0.01 | Median R 0.0
Episode 1000 | Average R   0.04 | Median R 0.0


In [52]:
# Train the actor-critic agent on 10x10 Catch with the state/action sizes given explicitly
# (100 flattened pixels, 3 actions), fitting one replay batch at the end of each episode.
env = Catch(grid_size=10)
agent = ActorCritic(env, state_dim=100, action_dim=3)
agent.run(episodes=2000, fit_data='per_episode')

Episode  200 | Average R   0.15 | Median R 0.0
Episode  400 | Average R  0.265 | Median R 0.0
Episode  600 | Average R  0.355 | Median R 0.0
Episode  800 | Average R  0.325 | Median R 0.0
Episode 1000 | Average R   0.36 | Median R 0.0
Episode 1200 | Average R   0.34 | Median R 0.0
Episode 1400 | Average R  0.375 | Median R 0.0
Episode 1600 | Average R  0.305 | Median R 0.0
Episode 1800 | Average R  0.355 | Median R 0.0
Episode 2000 | Average R  0.365 | Median R 0.0


In [49]:
## Evaluation (epsilon=0)
# A falsy fit_data disables training; for a DQN agent it also sets epsilon to 0.
agent.run(episodes=100, fit_data=False)

Episode   10 | Average R      1 | Median R 1.0
Episode   20 | Average R      1 | Median R 1.0
Episode   30 | Average R      1 | Median R 1.0
Episode   40 | Average R      1 | Median R 1.0
Episode   50 | Average R      1 | Median R 1.0
Episode   60 | Average R      1 | Median R 1.0
Episode   70 | Average R      1 | Median R 1.0
Episode   80 | Average R      1 | Median R 1.0
Episode   90 | Average R      1 | Median R 1.0
Episode  100 | Average R      1 | Median R 1.0


## Visualization

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import animation, rc
from IPython.display import HTML
frames = []

for e in range(100):
    loss = 0.
    env.reset()
    done = False
    state = env.observe()
    frames.append(state.reshape(env.grid_size,env.grid_size))
    while not done:
        q = agent.model.predict(state[np.newaxis])  # q table at current state
        action = np.argmax(q[0])
        next_state, reward, done, _ = env.step(action)
        frames.append(next_state.reshape(env.grid_size,env.grid_size))
        state = next_state
# plt.imshow(frames[9],interpolation='none', cmap='gray')

In [None]:
# Animation: replay the recorded frames as an inline HTML5 video.
fig, ax = plt.subplots()
im = ax.imshow(
    np.random.random((env.grid_size, env.grid_size)),  # placeholder image of the right shape
    interpolation='none',
    cmap='gray',
)


def init():
    """Initial draw for blitting: hand back the image artist unchanged."""
    return (im,)


def animate(i):
    """Put recorded frame ``i`` into the image artist and return it for redraw."""
    im.set_array(frames[i])
    return (im,)


anim = animation.FuncAnimation(
    fig,
    animate,
    init_func=init,
    frames=len(frames),
    interval=50,
    blit=True,
)
HTML(anim.to_html5_video())

In [None]:
#save to files: one PNG per recorded frame, named 000.png, 001.png, ...
for i, frame in enumerate(frames):
    # clear the figure first; otherwise every imshow stacks another image on the
    # same axes and each savefig has to re-render all previous frames
    plt.clf()
    plt.imshow(frame,interpolation='none', cmap='gray')
    plt.savefig("%03d.png" % i)

# [CartPole](https://gym.openai.com/envs/CartPole-v0/)

## Network on default state representation
* https://gym.openai.com/evaluations/eval_EIcM1ZBnQW2LBaFN6FY65g/
* https://gym.openai.com/evaluations/eval_OeUSZwUcR2qSAqMmOE1UIw/

In [76]:
# Train a DQN agent on CartPole-v0, fitting one replay batch after every step.
env = gym.make('CartPole-v0')#.unwrapped
agent = DQN(env)
agent.run(episodes=200, fit_data='per_step')

Episode   20 | Average R  10.75 | Median R 10.0
Episode   40 | Average R   39.1 | Median R 36.0
Episode   60 | Average R   81.4 | Median R 86.0
Episode   80 | Average R  125.5 | Median R 128.0
Episode  100 | Average R  136.6 | Median R 134.0
Episode  120 | Average R  145.3 | Median R 146.0
Episode  140 | Average R  146.1 | Median R 141.0
Episode  160 | Average R  161.9 | Median R 162.0
Episode  180 | Average R  135.5 | Median R 119.0
Episode  200 | Average R  133.1 | Median R 155.0


In [55]:
# Train the policy-gradient agent on CartPole-v0, fitting one replay batch after every step.
env = gym.make('CartPole-v0')
agent = PolicyNet(env)
agent.run(episodes=200, fit_data='per_step')

Episode   20 | Average R   54.3 | Median R 28.0
Episode   40 | Average R    197 | Median R 200.0
Episode   60 | Average R  117.8 | Median R 122.0
Episode   80 | Average R   48.3 | Median R 44.0
Episode  100 | Average R  75.85 | Median R 84.0
Episode  120 | Average R  129.3 | Median R 141.5
Episode  140 | Average R  135.2 | Median R 136.0
Episode  160 | Average R  105.7 | Median R 98.0
Episode  180 | Average R   70.1 | Median R 71.5
Episode  200 | Average R   60.6 | Median R 60.5


In [94]:
# Train the actor-critic agent on CartPole-v0, fitting one replay batch after every step.
env = gym.make('CartPole-v0')
agent = ActorCritic(env)
agent.run(episodes=100, fit_data='per_step')

Episode   10 | Average R   13.1 | Median R 12.0
Episode   20 | Average R   51.4 | Median R 44.5
Episode   30 | Average R   44.4 | Median R 43.0
Episode   40 | Average R   36.2 | Median R 38.0
Episode   50 | Average R  156.9 | Median R 180.5
Episode   60 | Average R    200 | Median R 200.0
Episode   70 | Average R    200 | Median R 200.0
Episode   80 | Average R  161.4 | Median R 146.0
Episode   90 | Average R  120.1 | Median R 120.5
Episode  100 | Average R  123.7 | Median R 120.0


In [80]:
# evaluation: fit_data=None matches neither 'per_step' nor 'per_episode', so no training happens
agent.run(episodes=100, fit_data=None)

Episode   10 | Average R  138.3 | Median R 138.5
Episode   20 | Average R  137.4 | Median R 139.0
Episode   30 | Average R  135.8 | Median R 137.5
Episode   40 | Average R  137.1 | Median R 137.0
Episode   50 | Average R  137.1 | Median R 138.0
Episode   60 | Average R  138.1 | Median R 140.0
Episode   70 | Average R  138.6 | Median R 137.5
Episode   80 | Average R    139 | Median R 137.5
Episode   90 | Average R  138.7 | Median R 138.5
Episode  100 | Average R    135 | Median R 134.0


## CNN value network on raw pixels
https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

network input: `state (raw pixels)` output: `Q(a|state)`