In [2]:
'''
author: Thyrix Yang
github: https://github.com/ThyrixYang
'''
import time
# Start the wall-clock timer before the heavier imports below; the elapsed
# time printed after train() is measured from this point.
start_time = time.time()

import numpy as np 
import gym
# Module-level list that the test rollout inside train() appends one
# accumulated episode reward to per epoch; later cells read it for plotting.
scores = []

class CEMOptimizer:
  """Cross-entropy-method optimizer over a flat weight vector.

  Each iteration samples `batch_size` weight vectors from an independent
  normal distribution per dimension, then refits the distribution's mean and
  standard deviation to the `int(batch_size * rho)` best-rewarded samples.

  Args:
    weights_dim: length of the weight vector being optimized.
    batch_size: number of weight vectors sampled per iteration.
    deviation: initial standard deviation used for every dimension.
    deviation_lim: upper bound applied to the refitted standard deviations.
    rho: fraction of each batch kept as the elite set.
    eta: extra noise factor; sampling uses `deviation * (1 + eta)`.
    mean: optional initial mean vector; zeros when omitted.
  """

  def __init__(self, weights_dim, batch_size=1000, deviation=10, deviation_lim=100, rho=0.1, eta=0.1, mean=None):
    self.rho = rho
    self.eta = eta
    self.weights_dim = weights_dim
    # Identity test: `mean != None` on an ndarray compares elementwise and
    # makes the conditional raise "truth value of an array is ambiguous".
    self.mean = mean if mean is not None else np.zeros(weights_dim)
    # Float dtype so the initial deviation matches what np.std produces later.
    self.deviation = np.full(weights_dim, deviation, dtype=float)
    self.batch_size = batch_size
    self.select_num = int(batch_size * rho)
    self.deviation_lim = deviation_lim

    assert self.select_num > 0, "batch_size * rho must select at least one sample"

  def update_weights(self, weights, rewards):
    """Refit mean and deviation to the highest-reward weight vectors.

    Args:
      weights: sequence of weight vectors, one per sampled candidate.
      rewards: one reward per candidate (flattened before ranking);
        larger is better.
    """
    rewards = np.array(rewards).flatten()
    weights = np.array(weights)
    # Negate so argsort's ascending order ranks the largest rewards first.
    sorted_idx = (-rewards).argsort()[:self.select_num]
    top_weights = weights[sorted_idx]
    # The reshape fixes the elite set to (select_num, weights_dim), so the
    # per-dimension statistics below always have length weights_dim.
    top_weights = np.reshape(top_weights, (self.select_num, self.weights_dim))
    self.mean = np.sum(top_weights, axis=0) / self.select_num
    # Cap the spread so the search distribution cannot grow without bound.
    self.deviation = np.minimum(np.std(top_weights, axis=0), self.deviation_lim)

  def sample_batch_weights(self):
    """Draw `batch_size` random weight vectors around the current mean."""
    return [np.random.normal(self.mean, self.deviation * (1 + self.eta)) \
        for _ in range(self.batch_size)]

  def get_weights(self):
    """Return the current mean weight vector."""
    return self.mean



def train():
  """Optimize a small tanh policy on Pendulum-v0 with the cross-entropy method.

  For each epoch a batch of weight vectors is sampled, every candidate is
  scored by its summed reward over `run_times` episodes, the optimizer is
  refit to the best candidates, and the resulting mean policy is rolled out
  once with rendering. Each test rollout's accumulated reward is appended to
  the module-level `scores` list. The environment is always closed on exit,
  including when training is interrupted or raises.
  """

  def select_action(ob, weights):
    """Map a 3-element observation to an action using a 28-entry weight vector.

    Layout of `weights`: [0] output bias, [1:4] output weights (1x3),
    [4:7] second-layer bias, [7:16] second-layer weights (3x3),
    [16:25] first-layer weights (3x3), [25:] first-layer bias.
    The result is a (1, 1) array scaled by tanh into [-2, 2].
    """
    b1 = np.reshape(weights[0], (1, 1))
    w1 = np.reshape(weights[1:4], (1, 3))
    b2 = np.reshape(weights[4:7], (3, 1))
    w2 = np.reshape(weights[7:16], (3, 3))
    w3 = np.reshape(weights[16:25], (3, 3))
    b3 = np.reshape(weights[25:], (3, 1))
    ob = np.reshape(ob, (3, 1))
    action = np.dot(w1, np.tanh(np.dot(w2, np.tanh(np.dot(w3, ob) - b3)) - b2)) - b1
    return np.tanh(action) * 2

  #set up environment
  # The dimension is the total parameter count consumed by select_action (28).
  opt = CEMOptimizer(3*3+3*3+3*1+3*1+3*1+1, 500, rho=0.01, eta=0.3, deviation=10, deviation_lim=20)
  env = gym.make("Pendulum-v0")
  env = gym.wrappers.Monitor(env, '/tmp/cartpole-experiment-3', force=True)
  epoch = 80
  run_times = 10

  def test():
    """Roll out the current mean weights for one rendered episode and record its reward."""
    W = opt.get_weights()
    observation = env.reset()
    accreward = 0
    while True:
      env.render()
      action = select_action(observation, W)
      observation, reward, done, info = env.step(action)
      accreward += reward
      if done:
        print("test end with reward: {}".format(accreward))
        scores.append(accreward)
        break

  try:
    for ep in range(epoch):
      print("start epoch {}".format(ep))
      weights = opt.sample_batch_weights() #randomize weights
      rewards = []
      # Decay the extra sampling noise a little every epoch.
      opt.eta *= 0.99
      print("deviation mean = {}".format(np.mean(opt.deviation)))
      for b in range(opt.batch_size):
        # A candidate's score is its reward summed over all run_times episodes.
        accreward = 0
        for _ in range(run_times):
          observation = env.reset()
          while True: #action updated continuously
            action = select_action(observation, weights[b])
            observation, reward, done, info = env.step(action)
            accreward += reward
            if done:
              break
        rewards.append(accreward)
      opt.update_weights(weights, rewards)
      test()
  finally:
    # Release the environment (and let the Monitor wrapper finalize its
    # recordings) even if training is interrupted or fails.
    env.close()

# Entry point: run the full training loop, then report the wall-clock time
# elapsed since start_time was recorded at the top of the cell.
if '__main__' == __name__:
  train()
  print("--- %s seconds ---" % (time.time() - start_time,))

start epoch 0
deviation mean = 10.0
test end with reward: [-1499.18663864]
start epoch 1
deviation mean = 11.348508533430941
test end with reward: [-1656.53788569]
start epoch 2
deviation mean = 10.5154989854672
test end with reward: [-895.16819746]
start epoch 3
deviation mean = 11.458376403298658
test end with reward: [-138.65170979]
start epoch 4
deviation mean = 10.360312620184294
test end with reward: [-1501.06955974]
start epoch 5
deviation mean = 11.473431544328815
test end with reward: [-137.84779373]
start epoch 6
deviation mean = 10.053089447499207
test end with reward: [-132.48289306]
start epoch 7
deviation mean = 8.817873390205383
test end with reward: [-1505.34641595]
start epoch 8
deviation mean = 8.652870869354295
test end with reward: [-954.53800206]
start epoch 9
deviation mean = 7.913195826309426
test end with reward: [-248.93505109]
start epoch 10
deviation mean = 7.570025969332937
test end with reward: [-362.66624103]
start epoch 11
deviation mean = 6.6638099397549

In [3]:
# Convert each recorded test reward to a plain Python float. The captured
# output prints rewards in brackets, i.e. as size-1 arrays; calling float()
# directly on an array with ndim > 0 is deprecated in recent NumPy, so go
# through .item(), which extracts the single element explicitly.
cleaned = [float(np.asarray(i).item()) for i in scores]
# One epoch index per recorded reward.
index = list(range(len(cleaned)))

In [4]:
import pandas as pd

In [5]:
import seaborn as sns

In [6]:
# Tabulate the per-epoch test rewards: one row per epoch index.
dataframe = pd.DataFrame(dict(Index=index, Reward=cleaned))

print(dataframe)

    Index       Reward
0       0 -1499.186639
1       1 -1656.537886
2       2  -895.168197
3       3  -138.651710
4       4 -1501.069560
5       5  -137.847794
6       6  -132.482893
7       7 -1505.346416
8       8  -954.538002
9       9  -248.935051
10     10  -362.666241
11     11  -134.847835
12     12    -1.751526
13     13  -120.908998
14     14  -335.817288
15     15  -355.194976
16     16 -1321.821749
17     17    -2.914542
18     18  -471.743887
19     19  -606.490512
20     20  -423.656681
21     21  -466.927206
22     22  -121.705615
23     23  -129.155993
24     24  -370.096897
25     25  -357.865151
26     26  -245.319224
27     27  -127.812961
28     28  -218.620273
29     29    -2.576680
..    ...          ...
50     50  -124.034787
51     51  -248.546875
52     52  -124.044754
53     53  -409.743033
54     54  -117.507900
55     55  -125.693206
56     56  -431.336444
57     57    -1.518506
58     58  -131.689177
59     59  -223.680791
60     60  -123.126107
61     61  

In [7]:
from bokeh.io import output_file, output_notebook, show
from bokeh.plotting import figure

# Send the rendered plot to a standalone file; to render inline in the
# notebook instead, call output_notebook() in place of output_file().
#output_notebook()
output_file("Cross_Entropy_Method")

# Figure canvas: test reward per epoch
p = figure(title='Performance of Cross-Entropy Method',
           x_axis_label='Epoch Number',
           y_axis_label='Reward at End of Test',
           plot_width=900, plot_height=600)

# Single line glyph: epoch index on x, end-of-test reward on y
p.line(dataframe['Index'], dataframe['Reward'],
       line_width=5, legend='Reward', color='black')

# Display the finished plot
show(p)