In [669]:
import gc
import os
import pickle
from typing import List, Tuple, Dict, Set, Union

import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter

# Turn on automatic garbage collection and run a full collection right away.
# gc.collect() is the last expression in the cell, so its return value is what
# the notebook echoes as the cell output.
gc.enable()
gc.collect()

0

In [653]:
# HELPER FUNCTIONS

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    x: scalar or array of logits

    Returns a value (or array of values) in [0, 1] with the same shape as x.
    Large-magnitude inputs saturate to exactly 0.0 or 1.0 instead of
    triggering an overflow warning inside np.exp.
    """
    x = np.asarray(x)
    # exp(-|x|) always lies in [0, 1], so neither branch below can overflow.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function.
    out = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
    # Indexing with () turns a 0-d result back into a scalar; arrays pass through.
    return out[()]

def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

    I: frame array indexed as [row, column, channel]; it is left unmodified

    Returns a new 1D float array of length 6400 holding only 0.0 and 1.0:
    pixels equal to 144 or 109 in channel 0 become 0, everything else 1.
    """
    # Crop rows 35..194, keep every second row/column, take channel 0.
    # The slice alone is a view, so masking it in place would overwrite pixels
    # of the caller's frame -- copy before editing.
    I = I[35:195:2, ::2, 0].copy()
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype('float').ravel()

In [687]:
# MODEL INITIALIZATION

# Hyperparams
H = 200 # n hidden layer neurons
batch_size = 16 # n episodes before param update
learning_rate = 1e-3
gamma = 0.99 # discount factor
b1 = 0.9 # ADAM: decay rate of the 1st-moment (mean) estimate
b2 = 0.999 # ADAM: decay rate of the 2nd-moment estimate
epsilon = 1e-7 # ADAM: added to the denominator to avoid division by zero
resume = False # resume from previous checkpoint?
path = '' # checkpoint name (no directory, no .p suffix) loaded when resume is True
render = False

# Model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume:
    # NOTE: unpickling can execute arbitrary code -- only resume from checkpoints you trust.
    # The context manager closes the file handle once the model is loaded.
    with open(f'checkpoints/{path}.p','rb') as checkpoint_file:
        model = pickle.load(checkpoint_file)
else:
    model = {}
    model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization, centers around 0
    model['W2'] = np.random.randn(H) / np.sqrt(H)

# Tensorboard logging
# Writer outputs to ./exp5/ (the log directory passed to SummaryWriter)
writer = SummaryWriter('exp5')
def log_reward(episode,r):
  """Log scalar r under the tag 'reward:' with episode as the global step."""
  writer.add_scalar('reward:',r,episode)

# Tracks gradients, 1st moment, and 2nd moment over a batch
# (m and c start at zero even when resuming: only `model` is loaded above)
gradient_buffer = { k : np.zeros_like(v) for k,v in model.items() }
m = { k : np.zeros_like(v) for k,v in model.items() }
c = { k : np.zeros_like(v) for k,v in model.items() }

In [688]:
# PG FUNCTIONS

def discount_rewards(r, discount=None):
  """ take 1D float array of rewards over episode and compute discounted reward

  r: per-step rewards in time order; a 1D sequence or a (T, 1) column
  discount: per-step discount factor; defaults to the module-level `gamma`

  Returns a float64 array shaped like r whose entry t is
  r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...
  """
  if discount is None:
    discount = gamma

  # Force a float buffer: with integer rewards, zeros_like(r) would be an
  # integer array and every fractional discounted value would be truncated.
  r = np.asarray(r, dtype=np.float64)
  discounted_r = np.zeros_like(r)
  G = 0.0

  # Working backwards from the terminal state
  for t in reversed(range(r.size)):
    # Return at step t = reward now + discounted return of the next step
    G = discount * G + r[t]
    discounted_r[t] = G

  return discounted_r

def policy_forward(x):
  """
  Run the two-layer policy network on a single input vector.

  x: input vector multiplied by the first-layer weights model['W1']

  Returns (p, h): p is the sigmoid of the output logit, interpreted as the
  probability of action 2; h is the hidden activation after the ReLU.
  """
  # First layer, then ReLU: negative pre-activations are replaced by zero
  pre_activation = np.dot(model['W1'], x)
  hidden = np.where(pre_activation < 0, 0.0, pre_activation)

  # Second layer collapses the hidden vector to one logit, squashed by sigmoid
  logit = np.dot(model['W2'], hidden)
  return sigmoid(logit), hidden

def policy_backward(h, x, pgrad):
  """
  Backpropagate the output-layer gradient signal to both weight matrices.

  h: stacked hidden activations (as produced by np.vstack in gradient_update)
  x: stacked network inputs
  pgrad: stacked output-layer gradient signal

  Returns a dict with keys 'W1' and 'W2' holding the gradient for the
  corresponding entry of `model`.
  """
  output_weights = model['W2']

  # Output layer: hidden activations weighted by the gradient signal, flattened
  grad_w2 = np.dot(h.T, pgrad).ravel()

  # Push the signal back through W2, then through the ReLU: units whose
  # activation was not positive pass no gradient
  grad_hidden = np.outer(pgrad, output_weights)
  grad_hidden = np.where(h <= 0, 0.0, grad_hidden)

  # Input layer
  grad_w1 = np.dot(grad_hidden.T, x)
  return {'W1': grad_w1, 'W2': grad_w2}

In [689]:
# INITIALIZE GYM

# Create the Pong environment and fetch the first observation
env = gym.make("Pong-v4")
observation, info = env.reset()

# Previous preprocessed frame, used for differencing (None until a frame is seen)
prev_x = None

# Per-step records for the first bucket:
# observed inputs, hidden states, policy gradients, rewards
xs = []
hs = []
pgrads = []
rs = []

# The same four records for the second ("positive") bucket
xsp = []
hsp = []
pgradsp = []
rsp = []

# Reward bookkeeping: smoothed episode reward, per-bucket sums, episode counter
running_reward = -21
reward_sum_neg = 0
reward_sum_pos = 0
episode_number = 0

In [690]:
# Define gradient update
def gradient_update(rs: List,
                    xs: List,
                    hs: List,
                    pgrads: List,
                    gradient_buffer: Dict):
    """Convert one bucket of recorded steps into gradients and add them to the batch buffer.

    rs: per-step rewards
    xs: per-step network inputs
    hs: per-step hidden activations
    pgrads: per-step output-layer gradient signals
    gradient_buffer: dict keyed like `model`; updated in place and also returned

    The rewards are discounted, shifted to zero mean and scaled to unit
    standard deviation, multiplied into pgrads, and backpropagated with
    policy_backward. An empty bucket leaves the buffer untouched.
    """
    # Nothing recorded: np.vstack would raise on empty lists
    if not rs:
        return gradient_buffer

    # Stack the per-step records into arrays with one row per step
    ixs = np.vstack(xs)
    ihs = np.vstack(hs)
    ipgrads = np.vstack(pgrads)
    # Float dtype so the in-place normalization below is valid for integer rewards too
    irs = np.vstack(rs).astype(np.float64)

    # Get discounted rewards
    discounted_r = discount_rewards(irs)
    # Normalize rewards
    discounted_r -= np.mean(discounted_r)
    std = np.std(discounted_r)
    # A single-step bucket or constant returns give std == 0; dividing would
    # fill the gradients (and then the weights) with NaN/inf. In that case the
    # centered returns are already all zero, so the bucket contributes nothing.
    if std > 1e-8:
        discounted_r /= std

    # Calculate gradients (using Advantage and Policy Gradients)
    ipgrads *= discounted_r
    grad = policy_backward(ihs, ixs, ipgrads)

    # Accumulate gradients over batch
    for k in model:
      gradient_buffer[k] += grad[k]

    return gradient_buffer

In [691]:
# Training loop: play Pong one step per iteration, accumulate gradients at the
# end of every episode, and apply an ADAM update every `batch_size` episodes.
# There is no stopping condition; interrupt the kernel to end training.
while True:
  if render: env.render()

  # Preprocess and difference observation
  current_x = prepro(observation)
  x = current_x - prev_x if prev_x is not None else np.zeros(D)
  prev_x = current_x

  # Feed forward policy network
  aprob, h = policy_forward(x)
  # Sample action from returned probability
  action = 2 if np.random.uniform() < aprob else 3

  # Take a step
  observation, reward, terminated, truncated, info = env.step(action)
  y = 1 if action == 2 else 0 # "fake label"

  # Record intermediates based on reward value
  # NOTE(review): `reward` is a single-step reward while `running_reward` is a
  # smoothed per-episode total -- confirm this comparison is the intended split.
  if running_reward is None or reward < running_reward:
    reward_sum_neg += reward
    rs.append(reward)
    xs.append(x) # observation
    hs.append(h) # hidden state
    pgrads.append(y - aprob) # policy gradient
  else:
    reward_sum_pos += reward
    rsp.append(reward)
    xsp.append(x)
    hsp.append(h)
    pgradsp.append(y - aprob)

  # If episode terminated
  if terminated or truncated:
    episode_number += 1

    # If xs is not empty
    if xs:
      gradient_buffer = gradient_update(rs, xs, hs, pgrads, gradient_buffer)
      # Reset to empty
      xs,hs,pgrads,rs = [],[],[],[]

    if xsp:
      gradient_buffer = gradient_update(rsp, xsp, hsp, pgradsp, gradient_buffer)
      xsp,hsp,pgradsp,rsp = [],[],[],[]

    # At end of batch: update model
    if episode_number % batch_size == 0:
      # ADAM's bias correction is indexed by the number of optimizer steps
      # taken (1 on the first update), not by the number of episodes played.
      adam_step = episode_number // batch_size

      # For each layer of weights
      for k,v in model.items():
        # Get summed gradient
        g = gradient_buffer[k]

        # ADAM optimizer: exponential moving averages of g and g**2
        m[k] = b1 * m[k] + (1 - b1) * g
        c[k] = b2 * c[k] + (1 - b2) * g**2

        # Bias-corrected moment estimates
        m_hat = m[k] / (1 - b1**adam_step)
        c_hat = c[k] / (1 - b2**adam_step)

        # Update model (+= : the gradients are ascended, not descended)
        model[k] += learning_rate * m_hat / (np.sqrt(c_hat) + epsilon)

        # Reset gradient buffer
        gradient_buffer[k] = np.zeros_like(v)

    # Book-keeping
    reward_sum = reward_sum_pos + reward_sum_neg
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    print('resetting env. episode reward total was %.2f. running mean: %.2f' % (reward_sum, running_reward))
    # Log reward
    log_reward(episode_number, running_reward)
    # Save model at checkpoints
    if episode_number % 500 == 0:
      # Create the directory on first use so a long run does not die on the
      # first save, and close the file handle as soon as the dump completes.
      os.makedirs('checkpoints', exist_ok=True)
      with open(f'checkpoints/save_exp5{episode_number}.p', 'wb') as checkpoint_file:
        pickle.dump(model, checkpoint_file)

    # Reset episode after termination
    reward_sum_neg = 0
    reward_sum_pos = 0
    prev_x = None
    observation, info = env.reset()

resetting env. episode reward total was -21.00. running mean: -21.00
resetting env. episode reward total was -21.00. running mean: -21.00
resetting env. episode reward total was -20.00. running mean: -20.99
resetting env. episode reward total was -21.00. running mean: -20.99
resetting env. episode reward total was -21.00. running mean: -20.99
resetting env. episode reward total was -20.00. running mean: -20.98
resetting env. episode reward total was -21.00. running mean: -20.98
resetting env. episode reward total was -21.00. running mean: -20.98
resetting env. episode reward total was -21.00. running mean: -20.98
resetting env. episode reward total was -21.00. running mean: -20.98
resetting env. episode reward total was -19.00. running mean: -20.96
resetting env. episode reward total was -20.00. running mean: -20.95
resetting env. episode reward total was -21.00. running mean: -20.95
resetting env. episode reward total was -21.00. running mean: -20.95
resetting env. episode reward tota