In [1]:
import os
import gym
import pickle
import random
import numpy as np
import multiprocessing as mp
import matplotlib.pyplot as plt

%matplotlib inline
# Seed every RNG this notebook draws from so a fresh "Restart & Run All" is
# repeatable: NumPy generates the ES perturbations, while the stdlib `random`
# module decides the exploration coin-flip in Agent.get_reward.
# NOTE(review): the gym environment may keep its own RNG state; seed it
# separately if bit-exact reproducibility is required.
np.random.seed(0)
random.seed(0)

# Record the library versions the results below were produced with.
print(np.__version__)
print(gym.__version__)

1.16.2
0.12.0


In [2]:
def worker_process(arg):
  """Evaluate one candidate for `Pool.map`.

  Args:
    arg: a 2-item iterable `(reward_function, candidate_weights)`.

  Returns:
    Whatever `reward_function(candidate_weights)` returns.
  """
  # Pool.map hands every job over as a single object, hence the packed pair.
  reward_fn, candidate_weights = arg
  score = reward_fn(candidate_weights)
  return score


class EvolutionStrategy(object):
  """Evolution-strategy optimizer over a list of numpy weight arrays.

  Each iteration samples `population_size` Gaussian perturbations of the
  current weights, scores every perturbed candidate with `get_reward_func`,
  and moves the weights along the reward-weighted sum of the perturbations.

  Args:
    weights: list of numpy arrays (one per layer). The list is updated in
      place, so any other holder of the same list sees the new weights.
    get_reward_func: callable taking a list of weight arrays and returning a
      scalar reward (higher is better).
    population_size: number of perturbed candidates per iteration.
    sigma: standard deviation of the perturbation noise.
    learning_rate: initial step size.
    decay: factor the learning rate is multiplied by after every update.
    num_threads: number of worker processes; -1 means `mp.cpu_count()`,
      values <= 1 evaluate candidates serially in this process.
    checkpoints: when True, weights and the reward history are pickled into
      `self.output_dir` every `CHECKPOINT_INTERVAL` iterations.
  """

  # Number of iterations between two checkpoints (when checkpoints are on).
  CHECKPOINT_INTERVAL = 50

  def __init__(self, weights,
               get_reward_func,
               population_size=20,
               sigma=0.1,
               learning_rate=0.01,
               decay=0.999,
               num_threads=1,
               checkpoints=False):

    self.weights = weights
    self.get_reward = get_reward_func

    self.POPULATION_SIZE = population_size
    self.SIGMA = sigma

    self.learning_rate = learning_rate
    self.decay = decay

    self.num_threads = mp.cpu_count() if num_threads == -1 else num_threads
    self.output_dir = 'output/'
    self.checkpoints = checkpoints

  def _get_weights_try(self, w, p):
    """Return the candidate weights `w + sigma * p`, computed layer by layer."""
    # w_try = w + sigma * N[j]
    return [w[index] + self.SIGMA * noise for index, noise in enumerate(p)]

  def get_weights(self):
    """Return the current list of weight arrays (not a copy)."""
    return self.weights

  def _get_population(self):
    """Sample one standard-normal noise array per layer for every candidate."""
    return [
        [np.random.randn(*w.shape) for w in self.weights]
        for _ in range(self.POPULATION_SIZE)
    ]

  def _get_rewards(self, pool, population):
    """Score every perturbed candidate and return the rewards as an array.

    Args:
      pool: a `multiprocessing.Pool`, or None to evaluate serially.
      population: list of per-layer noise lists from `_get_population`.
    """
    if pool is not None:
      worker_args = [
          (self.get_reward, self._get_weights_try(self.weights, p))
          for p in population
      ]
      rewards = pool.map(worker_process, worker_args)
    else:
      # R = np.zeros(npop)
      rewards = np.zeros(self.POPULATION_SIZE)
      for i, p in enumerate(population):
        # R[j] = f(w_try)
        rewards[i] = self.get_reward(self._get_weights_try(self.weights, p))

    return np.array(rewards)

  def _update_weights(self, rewards, population):
    """Apply one ES step to `self.weights` and decay the learning rate.

    Does nothing (no update, no decay) when all rewards are identical, since
    the standardized rewards would be undefined.
    """
    # np.std(R)
    std = rewards.std()

    if std == 0:
      return

    # A = (R - np.mean(R)) / np.std(R)
    rewards = (rewards - rewards.mean()) / std

    # alpha/(npop*sigma) -- identical for every layer, so computed once
    update_factor = self.learning_rate / (self.POPULATION_SIZE * self.SIGMA)

    for i, w in enumerate(self.weights):
      # Stack this layer's noise across the population: shape (npop, *w.shape)
      layer_population = np.array([p[i] for p in population])

      # np.dot(N.T, A): contract the population axis; the outer .T restores
      # the layer's original axis order.
      pop_rewards = np.dot(layer_population.T, rewards).T

      # w = w + alpha/(npop*sigma) * np.dot(N.T, A)
      self.weights[i] = w + update_factor * pop_rewards

    self.learning_rate *= self.decay

  def _save_checkpoint(self, iteration, results):
    """Pickle the current weights and the reward history for `iteration`."""
    print('Saving checkpoint...')

    # Create the target directory on demand so the first checkpoint does not
    # fail with FileNotFoundError on a fresh checkout.
    if self.output_dir:
      os.makedirs(self.output_dir, exist_ok=True)

    payloads = (
        ('bipedal-es_ep{}.p'.format(iteration), self.weights),
        ('bipedal-es_results{}.p'.format(iteration), results),
    )
    for filename, payload in payloads:
      file_path = os.path.join(self.output_dir, filename)
      # `with` guarantees the file is flushed and closed.
      with open(file_path, 'wb') as checkpoint_file:
        pickle.dump(payload, checkpoint_file)

  def run(self, iterations, print_step=10):
    """Run the optimizer for `iterations` steps.

    Args:
      iterations: number of ES updates to perform.
      print_step: the reward is printed every `print_step` iterations.

    Returns:
      List with the reward of the unperturbed weights measured at every
      iteration (before that iteration's update is applied).
    """
    pool = mp.Pool(self.num_threads) if self.num_threads > 1 else None
    results = []

    try:
      for iteration in range(iterations):
        # N = np.random.randn(npop, 3)
        population = self._get_population()

        # for j in range(npop):
        rewards = self._get_rewards(pool, population)

        curr_reward = self.get_reward(self.weights)

        self._update_weights(rewards, population)
        results.append(curr_reward)

        if (iteration + 1) % print_step == 0:
          print('iter %d. reward: %f' % (iteration + 1, curr_reward))

        if self.checkpoints and (iteration + 1) % self.CHECKPOINT_INTERVAL == 0:
          self._save_checkpoint(iteration + 1, results)

      if pool is not None:
        pool.close()
    except BaseException:
      # Do not leave worker processes behind when an evaluation fails or the
      # run is interrupted.
      if pool is not None:
        pool.terminate()
      raise
    finally:
      if pool is not None:
        pool.join()

    return results

In [3]:
class Model(object):
  """Bias-free, activation-free stack of three linear layers.

  Args:
    obs_shape: length of the flattened input vector.
    act_shape: length of the output vector.
    layer_shape: width of the two hidden layers.
    dtype: numpy dtype of the weight matrices.

  All weights start at zero.
  """

  def __init__(self, obs_shape, act_shape, layer_shape=16, dtype=np.float64):
    layer_dims = [
        (obs_shape, layer_shape),
        (layer_shape, layer_shape),
        (layer_shape, act_shape),
    ]
    self.weights = [np.zeros(shape=dims, dtype=dtype) for dims in layer_dims]

  def predict(self, inp):
    """Map an input to an output vector of length `act_shape`.

    The input is flattened and scaled to unit L2 norm before being multiplied
    through the layers. An all-zero input is passed through unscaled: dividing
    by its zero norm would turn every output into NaN.

    Args:
      inp: array-like whose flattened length equals `obs_shape`.

    Returns:
      1-D numpy array with `act_shape` entries.
    """
    out = np.expand_dims(np.asarray(inp).flatten(), 0)

    norm = np.linalg.norm(out)
    if norm > 0:
      out = out / norm

    for layer in self.weights:
      out = np.dot(out, layer)

    # Drop the batch axis added above.
    return out[0]

  def get_weights(self):
    """Return the list of weight matrices (not a copy)."""
    return self.weights

  def set_weights(self, weights):
    """Replace the weight matrices with `weights` (stored by reference)."""
    self.weights = weights

In [4]:
class Agent:
  """Ties a gym environment, a linear `Model` and an `EvolutionStrategy` together.

  The agent exposes `get_reward` as the objective the evolution strategy
  maximizes, and offers helpers to train, play, save and load the weights.
  """

  # Number of most recent observations fed to the model at every step.
  AGENT_HISTORY_LENGTH = 1
  POPULATION_SIZE = 32
  # Number of episodes averaged per reward evaluation.
  EPS_AVG = 1
  SIGMA = 0.1
  LEARNING_RATE = 0.01
  # Probability of taking a random action during training: starts at
  # INITIAL_EXPLORATION and decreases linearly, one step at a time, reaching
  # FINAL_EXPLORATION after EXPLORATION_DEC_STEPS environment steps.
  INITIAL_EXPLORATION = 1.0
  FINAL_EXPLORATION = 0.0
  EXPLORATION_DEC_STEPS = 1000000

  def __init__(self):
    self.env = gym.make('BipedalWalkerHardcore-v2')

    # Model(obs_shape=24, act_shape=4, layer_shape=78)
    self.model = Model(24, 4, 78)

    # The strategy receives the model's own weight list, so in-place updates
    # made by the optimizer are visible through the model as well.
    self.es = EvolutionStrategy(
        self.model.get_weights(),
        self.get_reward,
        self.POPULATION_SIZE,
        self.SIGMA,
        self.LEARNING_RATE,
        checkpoints=True
    )

    self.exploration = self.INITIAL_EXPLORATION

    self.output_dir = 'output/'

  def summary(self):
    """Print the main hyper-parameters and the number of model layers."""
    print('Population size:', self.POPULATION_SIZE)
    print('Learning rate:', self.LEARNING_RATE)
    print('Sigma:', self.SIGMA)
    print('Layers:', len(self.model.get_weights()))

  def get_predicted_action(self, sequence):
    """Return the model's output for the given list of recent observations."""
    return self.model.predict(np.array(sequence))

  def load(self, filename='weights.p'):
    """Load pickled weights from `output_dir/filename` into model and strategy.

    SECURITY: unpickling can execute arbitrary code. Only load checkpoint
    files that you created yourself or otherwise trust.
    """
    file_path = os.path.join(self.output_dir, filename)

    with open(file_path, 'rb') as weights_file:
      weights = pickle.load(weights_file)

    self.model.set_weights(weights)
    self.es.weights = self.model.get_weights()

  def save(self, filename='weights.p'):
    """Pickle the strategy's current weights to `output_dir/filename`."""
    # Create the directory on demand so saving works on a fresh checkout.
    if self.output_dir:
      os.makedirs(self.output_dir, exist_ok=True)

    file_path = os.path.join(self.output_dir, filename)

    with open(file_path, 'wb') as weights_file:
      pickle.dump(self.es.get_weights(), weights_file)

  def _run_episode(self, explore=False, render=False):
    """Play one episode with the model's current weights; return its total reward.

    Args:
      explore: when True, decay `self.exploration` by one step per environment
        step and take a random action with that probability.
      render: when True, render the environment before every step.
    """
    observation = self.env.reset()
    sequence = [observation] * self.AGENT_HISTORY_LENGTH
    episode_reward = 0.0
    done = False

    while not done:
      if render:
        self.env.render()

      take_random_action = False
      if explore:
        self.exploration = max(
            self.FINAL_EXPLORATION,
            self.exploration - self.INITIAL_EXPLORATION / self.EXPLORATION_DEC_STEPS
        )
        take_random_action = random.random() < self.exploration

      if take_random_action:
        action = self.env.action_space.sample()
      else:
        action = self.get_predicted_action(sequence)

      observation, reward, done, _ = self.env.step(action)
      episode_reward += reward

      # Slide the observation window forward by one step.
      sequence = sequence[1:]
      sequence.append(observation)

    return episode_reward

  def play(self, episodes, render=True):
    """Play `episodes` episodes without exploration, printing each total reward."""
    self.model.set_weights(self.es.weights)
    # To record videos, wrap the environment first:
    # self.env = gym.wrappers.Monitor(self.env, 'video', force=True)

    for _ in range(episodes):
      total_reward = self._run_episode(render=render)
      print("Total reward:", total_reward)

  def train(self, iterations):
    """Run `iterations` evolution-strategy updates, printing every iteration."""
    self.es.run(iterations, print_step=1)

  # f(w)
  def get_reward(self, weights):
    """Objective for the evolution strategy.

    Installs `weights` in the model, plays `EPS_AVG` exploratory episodes and
    returns the mean total reward per episode.
    """
    self.model.set_weights(weights)

    total_reward = 0.0
    for _ in range(self.EPS_AVG):
      total_reward += self._run_episode(explore=True)

    return total_reward / self.EPS_AVG

In [5]:
# Build a fresh agent, restore the weights checkpointed after iteration 350,
# and play ten rendered episodes (each episode's total reward is printed).
agent = Agent()
agent.load(filename='bipedal-es_ep350.p')
agent.play(episodes=10)

Total reward: -91.28967385955082
Total reward: -120.7362008030654
Total reward: -151.93435643107284
Total reward: -41.580282757850085
Total reward: -116.03944843151905
Total reward: -92.73282820205053
Total reward: -145.8591758997851
Total reward: -114.58805023258078
Total reward: -111.15519262823149
Total reward: -79.01692742220702
