In [None]:
import gym
import tensorflow as tf
import tensorflow.keras as tfk
import numpy as np

In [None]:
class Agent(tfk.Model):
    """Fully-connected Q-network: one Q-value estimate per discrete action.

    The network is a stack of Dense layers of the requested hidden sizes
    followed by a linear Dense layer with ``env.action_space.n`` outputs.
    """

    def __init__(self, env, hidden_sizes=(16,), act='relu', model_to_copy=None):
        """
        Args:
            env: environment exposing ``action_space.n`` and ``observation_space.shape``.
            hidden_sizes: iterable with the width of each hidden Dense layer.
            act: activation used by the hidden layers.
            model_to_copy: optional model whose layer weights are copied into
                this one, layer by layer, after construction.
        """
        super(Agent, self).__init__()

        hidden_layers = [tfk.layers.Dense(size, activation=act) for size in hidden_sizes]
        # Final linear layer produces the Q(s, :) estimates
        q_layer = tfk.layers.Dense(env.action_space.n, activation=None)
        self.fc_layers = hidden_layers + [q_layer]

        # A dummy forward pass makes the layers create their weights, so that
        # get_weights / set_weights below have something to operate on
        self(np.zeros(shape=(1,) + env.observation_space.shape, dtype=np.float32))

        if model_to_copy is not None:
            for l_this, l_other in zip(self.layers, model_to_copy.layers):
                l_this.set_weights(l_other.get_weights())

    def soft_update_weights(self, other, gamma=0.75):
        """Blend this model's weights towards ``other``'s, layer by layer.

        Each weight becomes ``gamma * this + (1 - gamma) * other``, so ``gamma``
        is the fraction of this model's current weights that is retained.
        """
        for l_this, l_other in zip(self.layers, other.layers):
            w_other = l_other.get_weights()
            w_this = l_this.get_weights()
            # get_weights returns a *list* of arrays per layer, hence the inner loop
            l_this.set_weights([ww_other * (1.0 - gamma) + gamma * ww_this
                                for ww_other, ww_this in zip(w_other, w_this)])

    def call(self, x):
        """Apply every layer in ``fc_layers`` in order and return the result."""
        for layer in self.fc_layers:
            x = layer(x)

        return x

We need to be able to copy weights between models. This article may explain how to do so: https://medium.com/randomai/model-surgery-copy-weights-from-model-to-model-a31b1dec7a7a

In [None]:
def generate_episode(env, agent, explore_eps=0.2, max_steps=200):
    """
    Run one episode with an epsilon-greedy policy and record every transition.

    The agent is only used in forward mode. At each step the greedy action is
    the argmax of the agent's output; with probability ``explore_eps`` a
    uniformly random action is taken instead. The action that is recorded is
    the action that is actually sent to the environment.

    Args:
        env: environment where ``reset()`` returns an observation, ``step(a)``
            returns a 4-tuple ``(s_next, r, done, info)``, and
            ``action_space.n`` is the number of actions.
        agent: callable mapping a batch of observations to Q(s, :) estimates.
        explore_eps: probability of taking a random action.
        max_steps: upper bound on the number of environment steps.

    Returns:
        A 3-tuple ``(transitions, ep_reward, qa_max_hist)`` where
        ``transitions`` is a list of ``(s, a, r, s_next, done)`` tuples (one per
        step taken), ``ep_reward`` is the sum of rewards, and ``qa_max_hist``
        holds the greedy action's Q estimate at each step.
    """

    s_hist = []
    s_next_hist = []
    a_hist = []
    r_hist = []
    d_hist = []

    qa_max_hist = []

    s = env.reset()

    for _step in range(max_steps):

        q_estimates = agent(np.array([s], dtype=np.float32))

        a_max = np.argmax(q_estimates, axis=1)[0]

        a_selected = a_max if (np.random.uniform() > explore_eps) \
                           else np.random.choice(env.action_space.n)

        # Step with the action we record, so stored transitions match what happened
        s_next, r, done, _ = env.step(a_selected)

        s_hist.append(s)
        s_next_hist.append(s_next)
        a_hist.append(a_selected)
        r_hist.append(r)
        d_hist.append(done)

        qa_max_hist.append(q_estimates[0, a_max])

        if done:
            break

        # Advance so the next decision is made from the new observation
        s = s_next

    ep_reward = sum(r_hist)

    # Package up into (s, a, r, s_next, done) tuples.
    # These are exactly what we need to iteratively impose the Bellman equation.
    # Next states are stored explicitly, so the final transition is kept even
    # when the episode is cut off by max_steps rather than by `done`.
    transitions = list(zip(s_hist, a_hist, r_hist, s_next_hist, d_hist))
    return transitions, ep_reward, qa_max_hist

In [None]:
def train_agent(agent, target_agent, optimizer, experiences, gamma=0.99):
    """Take one gradient step moving ``agent`` towards its Bellman targets.

    Args:
        agent: model being trained; maps a batch of states to Q(s, :).
        target_agent: model used to evaluate next states when building targets.
        optimizer: optimizer whose ``apply_gradients`` updates ``agent``.
        experiences: sequence of ``(s, a, r, s_next, done)`` tuples.
        gamma: discount factor applied to the next-state value.

    Returns:
        The scalar mean-squared-error loss for this batch.
    """

    states = np.array([s for s, _, _, _, _ in experiences], dtype=np.float32)
    next_states = np.array([ns for _, _, _, ns, _ in experiences], dtype=np.float32)
    actions = np.array([a for _, a, _, _, _ in experiences], dtype=np.int64)
    rewards = np.array([r for _, _, r, _, _ in experiences], dtype=np.float32)
    # Builtin bool: the np.bool alias no longer exists in current NumPy releases
    dones = np.array([d for _, _, _, _, d in experiences], dtype=bool)

    # Float mask: 1.0 where the episode continues, 0.0 on terminal transitions,
    # so terminal targets reduce to the immediate reward
    not_done = 1.0 - dones.astype(np.float32)

    # Recall that our agent outputs Q(s, :), with a neuron for each action.
    # We must take the max over these to get the RHS of the Bellman equation.
    target_qs = tf.reduce_max(target_agent(next_states), axis=1)
    # Targets are built outside the tape, so no gradient flows through them
    targets = target_qs * (gamma * not_done) + rewards

    # Now we have our targets, we can train our agent towards them
    with tf.GradientTape() as tape:
        agent_qs_all_a = agent(states)
        n = agent_qs_all_a.shape[1]
        # A one-hot mask picks out Q(s, a) for the action actually taken
        actions_oh = tf.one_hot(indices=actions, depth=n, dtype=agent_qs_all_a.dtype)
        agent_qs = tf.reduce_sum(agent_qs_all_a * actions_oh, axis=1)
        loss = tf.reduce_mean(tf.square(targets - agent_qs))
    gradients = tape.gradient(loss, agent.trainable_variables)
    optimizer.apply_gradients(zip(gradients, agent.trainable_variables))

    return loss

In [None]:
# Environment, the online Q-network, and a target network initialised from
# the online network's weights
env = gym.make('CartPole-v0')
agent = Agent(env)
target_agent = Agent(env, model_to_copy=agent)

optimizer = tfk.optimizers.SGD()

# Naive replay memory: every transition ever generated is kept
experiences = list()

In [None]:
# Number of training steps to run, and how many episodes to generate before each one
num_training_steps = 1_000
episodes_per_train = 1

# Upper bound on the number of transitions sampled for one training step
batch_size = 128

In [None]:
# Tracking of performance
class EMA:
    """Exponential moving average of a stream of values.

    The first value passed to ``update`` seeds the average; every later value
    ``x`` moves it to ``gamma * average + (1 - gamma) * x``.
    """

    def __init__(self, gamma=0.99):
        # None marks "nothing observed yet" so the first update can seed the average
        self._x = None
        self._gamma = gamma

    def update(self, x):
        """Fold a new observation ``x`` into the running average."""
        if self._x is None:
            self._x = x
        else:
            # Rebind rather than using *= / +=: the stored value may be the very
            # object the caller passed in first, and in-place operators would
            # modify it when it is a mutable array
            self._x = self._x * self._gamma + (1 - self._gamma) * x

    def get(self):
        """Return the current average, or None if ``update`` was never called."""
        return self._x


ema_reward = EMA()

In [None]:
for step in range(num_training_steps):
    # Collect fresh experience with the current online agent
    for _ in range(episodes_per_train):
        new_experiences, ep_reward, qa_max_hist = generate_episode(env, agent)
        experiences.extend(new_experiences)
        ema_reward.update(ep_reward)

    # Sample a minibatch of distinct transitions; the batch is capped at the
    # number of stored transitions when fewer than batch_size exist
    this_batch_size = min(len(experiences), batch_size)
    exp_idx = np.random.choice(len(experiences), size=this_batch_size, replace=False)
    experiences_this_batch = [experiences[idx] for idx in exp_idx]

    # One gradient step on the online agent, then blend the target towards it
    train_agent(agent, target_agent, optimizer, experiences_this_batch)
    target_agent.soft_update_weights(agent)

    if not step % 10:
        print(f'Step: {step}   EMA reward: {ema_reward.get()}')