In [1]:
import gym
import numpy as np
# Gym environment id; this string is what gets passed to gym.make() when the
# environment is created.
rm='Pong-v4'

In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Show a sequence of image frames as an inline JavaScript animation.

    frames -- indexable, non-empty sequence of image arrays. The first frame's
              shape (rows, columns) determines the figure size, and every
              frame is drawn in order at 50 ms intervals.

    The figure is closed before the animation is rendered to HTML so that only
    the animation widget is displayed, not an extra static figure.
    """
    first_frame = frames[0]
    n_rows, n_cols = first_frame.shape[0], first_frame.shape[1]
    fig = plt.figure(figsize=(n_cols / 72.0, n_rows / 72.0), dpi=144)
    image = plt.imshow(first_frame)
    plt.axis('off')

    def show_frame(index):
        # Swap the pixel data of the single image artist instead of redrawing.
        image.set_data(frames[index])

    anim = animation.FuncAnimation(fig, show_frame, frames=len(frames), interval=50)
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

In [3]:
# Environment, network weights and training hyperparameters. Everything defined
# in this cell is module-level state that the functions below read directly.

# NOTE(review): this assigns a plain attribute on the imported gym module.
# Confirm that the installed gym version actually reads it -- TODO verify.
gym.new_step_api=True
env = gym.make(rm)
# model initialization: two-layer network, W1 maps D inputs to H hidden units,
# W2 maps the H hidden units to a single output
H = 800 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H) # same scaling, by the number of hidden units
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
#learning_rate = 1e-3
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dicts mirror the keys and shapes of `model` and start at zero.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
  """Logistic "squashing" function 1 / (1 + e^-x), applied elementwise.

  Accepts a scalar or a NumPy array and returns values between 0 and 1.
  """
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

  Steps: crop rows 35..194, keep every second row and column of colour
  channel 0, then binarise: pixels equal to 0, 144 or 109 (the two background
  colours) become 0.0 and every other pixel (paddles, ball) becomes 1.0.

  The input frame is never modified. Slicing a NumPy array yields a view, so
  masking the slice in place would overwrite the caller's observation; the
  mask is therefore built as a new boolean array instead.

  Returns a new float array of shape (6400,).
  """
  cropped = I[35:195:2, ::2, 0] # crop + downsample by factor of 2 (read-only view)
  foreground = (cropped != 0) & (cropped != 144) & (cropped != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """ take array of rewards and compute discounted reward

  r        -- rewards indexed by timestep along the first axis (1D, or an
              (N, 1) column as produced by np.vstack); lists are accepted.
  discount -- per-step discount factor; when None the module-level `gamma`
              is used, which is the original behaviour.

  The running sum is reset whenever a nonzero reward is met, because in Pong
  a nonzero reward marks a game boundary.

  Returns a floating-point array with the same shape as r. Integer (or bool)
  rewards are promoted to float so the discounted values are not truncated;
  floating inputs keep their dtype.
  """
  factor = gamma if discount is None else discount
  r = np.asarray(r)
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else float
  discounted_r = np.zeros_like(r, dtype=out_dtype)
  running_add = 0.0
  for t in reversed(range(r.size)):
    if r[t] != 0: running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * factor + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network for one input vector.

  Weights are read from the module-level `model` dict ('W1', then 'W2').

  Returns (p, h): p is the sigmoid of the output unit, used by the callers as
  the probability of taking action 2, and h is the hidden layer after ReLU.
  """
  w_hidden, w_out = model['W1'], model['W2']
  pre_activation = np.dot(w_hidden, x)
  hidden = np.where(pre_activation < 0, 0.0, pre_activation) # ReLU nonlinearity
  logit = np.dot(w_out, hidden)
  return sigmoid(logit), hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns gradients for both weight matrices.

  epx     -- stacked network inputs, one row per timestep
  eph     -- stacked hidden states, one row per timestep
  epdlogp -- stacked output gradients (already scaled by the caller), one row
             per timestep

  'W2' is read from the module-level `model` dict. The result is a dict with
  keys 'W1' and 'W2' matching the layout of `model`.
  """
  grad_w2 = np.dot(eph.T, epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, model['W2'])
  # backprop through the ReLU: no gradient where the hidden unit was inactive
  grad_hidden = np.where(eph <= 0, 0.0, grad_hidden)
  grad_w1 = np.dot(grad_hidden.T, epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Pick the greedy action for one frame using the supplied policy weights.

  model       -- dict with weight arrays under 'W1' and 'W2'; the forward pass
                 is computed from this argument, so a model other than the
                 module-level one can be evaluated
  observation -- raw frame accepted by prepro()
  prev_x      -- preprocessed previous frame, or None on the first step of an
                 episode

  Returns (action, cur_x): action is 2 when the network output is >= 0.5 and
  3 otherwise (deterministic, no sampling); cur_x is the preprocessed current
  frame, to be passed back in as prev_x on the next call.
  """
  # preprocess the observation, set input to network to be difference image
  cur_x = prepro(observation)
  x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)

  # forward the policy network with the weights that were passed in
  hidden = np.dot(model['W1'], x)
  hidden[hidden < 0] = 0 # ReLU nonlinearity
  aprob = sigmoid(np.dot(model['W2'], hidden))

  # greedy choice: take the more probable action
  action = 2 if aprob >= 0.5 else 3

  return action, cur_x

def play_game(env, model):
  """Play one episode (at most 1000 steps) with the greedy policy and show it.

  env   -- environment exposing reset(), render(mode='rgb_array'),
           step(action) returning (observation, reward, done, info), and close()
  model -- policy weights forwarded to model_step()

  Prints exactly one summary line: the "finished after N timesteps" message
  when the environment reports done, otherwise the "finished without success"
  message once the step limit is exhausted. The environment is closed even if
  a step raises, and the collected frames are then displayed as an animation.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  try:
    for t in range(1000):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
    else:
      # only reached when the loop ran out of steps without `done`
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  finally:
    env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy network with policy gradients and RMSProp updates.

  env            -- environment exposing reset() and step(action) returning
                    (observation, reward, done, info)
  model          -- dict of weight arrays, updated in place
  total_episodes -- number of episodes to run; a value <= 0 returns
                    immediately with an empty history

  Returns a list of (episode_number, reward_sum, running_reward) tuples, one
  per finished episode.

  Module-level state used: D, batch_size, learning_rate, decay_rate,
  grad_buffer and rmsprop_cache (the last two are mutated). The forward and
  backward passes go through policy_forward()/policy_backward(), which read
  the module-level `model`; the `model` argument should therefore be that
  same dict, otherwise gradients and updates refer to different weights.
  """
  hist = []
  if total_episodes <= 0:
    return hist # nothing to train; avoids a loop that could never terminate

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:
    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    # record reward (has to be done after we call step() to get reward for previous action);
    # in Pong the reward is +1 or -1 exactly when a game ends
    drs.append(reward)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0: # a zero spread would turn the division into NaNs that poison the weights
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      # the episode counter is an integer, so print it with %d rather than %f
      print ('episode %d, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      if episode_number >= total_episodes:
        return hist

  deprecation(
  deprecation(


In [None]:
# Train for 6000 episodes; hist1 receives the per-episode history returned by
# train_model and %time reports how long the call took.
%time hist1 = train_model(env, model, total_episodes=6000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -21.000000. running mean: -21.000000
episode 3.000000, reward total was -20.000000. running mean: -20.990000
episode 4.000000, reward total was -20.000000. running mean: -20.980100
episode 5.000000, reward total was -20.000000. running mean: -20.970299
episode 6.000000, reward total was -20.000000. running mean: -20.960596
episode 7.000000, reward total was -21.000000. running mean: -20.960990
episode 8.000000, reward total was -21.000000. running mean: -20.961380
episode 9.000000, reward total was -21.000000. running mean: -20.961766
episode 10.000000, reward total was -21.000000. running mean: -20.962149
episode 11.000000, reward total was -20.000000. running mean: -20.952527
episode 12.000000, reward total was -21.000000. running mean: -20.953002
episode 13.000000, reward total was -20.000000. running mean: -20.943472
episode 14.000000, reward total was -20.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.650190
episode 115.000000, reward total was -20.000000. running mean: -20.643688
episode 116.000000, reward total was -21.000000. running mean: -20.647251
episode 117.000000, reward total was -21.000000. running mean: -20.650779
episode 118.000000, reward total was -21.000000. running mean: -20.654271
episode 119.000000, reward total was -20.000000. running mean: -20.647728
episode 120.000000, reward total was -20.000000. running mean: -20.641251
episode 121.000000, reward total was -20.000000. running mean: -20.634838
episode 122.000000, reward total was -20.000000. running mean: -20.628490
episode 123.000000, reward total was -21.000000. running mean: -20.632205
episode 124.000000, reward total was -21.000000. running mean: -20.635883
episode 125.000000, reward total was -19.000000. running mean: -20.619524
episode 126.000000, reward total was -20.000000. running mean: -20.613329
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.444428
episode 226.000000, reward total was -21.000000. running mean: -20.449984
episode 227.000000, reward total was -21.000000. running mean: -20.455484
episode 228.000000, reward total was -20.000000. running mean: -20.450929
episode 229.000000, reward total was -20.000000. running mean: -20.446420
episode 230.000000, reward total was -21.000000. running mean: -20.451955
episode 231.000000, reward total was -21.000000. running mean: -20.457436
episode 232.000000, reward total was -20.000000. running mean: -20.452862
episode 233.000000, reward total was -20.000000. running mean: -20.448333
episode 234.000000, reward total was -21.000000. running mean: -20.453850
episode 235.000000, reward total was -19.000000. running mean: -20.439311
episode 236.000000, reward total was -20.000000. running mean: -20.434918
episode 237.000000, reward total was -19.000000. running mean: -20.420569
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -20.340075
episode 337.000000, reward total was -20.000000. running mean: -20.336674
episode 338.000000, reward total was -20.000000. running mean: -20.333307
episode 339.000000, reward total was -21.000000. running mean: -20.339974
episode 340.000000, reward total was -21.000000. running mean: -20.346574
episode 341.000000, reward total was -21.000000. running mean: -20.353109
episode 342.000000, reward total was -20.000000. running mean: -20.349577
episode 343.000000, reward total was -21.000000. running mean: -20.356082
episode 344.000000, reward total was -19.000000. running mean: -20.342521
episode 345.000000, reward total was -21.000000. running mean: -20.349096
episode 346.000000, reward total was -21.000000. running mean: -20.355605
episode 347.000000, reward total was -21.000000. running mean: -20.362049
episode 348.000000, reward total was -21.000000. running mean: -20.368428
episode 349.000000, reward total was -

episode 447.000000, reward total was -20.000000. running mean: -20.384485
episode 448.000000, reward total was -19.000000. running mean: -20.370640
episode 449.000000, reward total was -21.000000. running mean: -20.376934
episode 450.000000, reward total was -19.000000. running mean: -20.363165
episode 451.000000, reward total was -20.000000. running mean: -20.359533
episode 452.000000, reward total was -21.000000. running mean: -20.365938
episode 453.000000, reward total was -21.000000. running mean: -20.372278
episode 454.000000, reward total was -21.000000. running mean: -20.378555
episode 455.000000, reward total was -19.000000. running mean: -20.364770
episode 456.000000, reward total was -20.000000. running mean: -20.361122
episode 457.000000, reward total was -21.000000. running mean: -20.367511
episode 458.000000, reward total was -20.000000. running mean: -20.363836
episode 459.000000, reward total was -20.000000. running mean: -20.360198
episode 460.000000, reward total was -

episode 558.000000, reward total was -17.000000. running mean: -20.300485
episode 559.000000, reward total was -21.000000. running mean: -20.307480
episode 560.000000, reward total was -20.000000. running mean: -20.304406
episode 561.000000, reward total was -20.000000. running mean: -20.301362
episode 562.000000, reward total was -20.000000. running mean: -20.298348
episode 563.000000, reward total was -19.000000. running mean: -20.285364
episode 564.000000, reward total was -21.000000. running mean: -20.292511
episode 565.000000, reward total was -21.000000. running mean: -20.299586
episode 566.000000, reward total was -20.000000. running mean: -20.296590
episode 567.000000, reward total was -20.000000. running mean: -20.293624
episode 568.000000, reward total was -19.000000. running mean: -20.280688
episode 569.000000, reward total was -20.000000. running mean: -20.277881
episode 570.000000, reward total was -21.000000. running mean: -20.285102
episode 571.000000, reward total was -

episode 669.000000, reward total was -19.000000. running mean: -20.312255
episode 670.000000, reward total was -21.000000. running mean: -20.319133
episode 671.000000, reward total was -20.000000. running mean: -20.315942
episode 672.000000, reward total was -20.000000. running mean: -20.312782
episode 673.000000, reward total was -21.000000. running mean: -20.319654
episode 674.000000, reward total was -21.000000. running mean: -20.326458
episode 675.000000, reward total was -20.000000. running mean: -20.323193
episode 676.000000, reward total was -21.000000. running mean: -20.329961
episode 677.000000, reward total was -21.000000. running mean: -20.336662
episode 678.000000, reward total was -21.000000. running mean: -20.343295
episode 679.000000, reward total was -20.000000. running mean: -20.339862
episode 680.000000, reward total was -20.000000. running mean: -20.336463
episode 681.000000, reward total was -21.000000. running mean: -20.343099
episode 682.000000, reward total was -

episode 780.000000, reward total was -20.000000. running mean: -20.161399
episode 781.000000, reward total was -21.000000. running mean: -20.169785
episode 782.000000, reward total was -21.000000. running mean: -20.178087
episode 783.000000, reward total was -20.000000. running mean: -20.176306
episode 784.000000, reward total was -20.000000. running mean: -20.174543
episode 785.000000, reward total was -20.000000. running mean: -20.172798
episode 786.000000, reward total was -20.000000. running mean: -20.171070
episode 787.000000, reward total was -19.000000. running mean: -20.159359
episode 788.000000, reward total was -21.000000. running mean: -20.167765
episode 789.000000, reward total was -19.000000. running mean: -20.156088
episode 790.000000, reward total was -19.000000. running mean: -20.144527
episode 791.000000, reward total was -18.000000. running mean: -20.123082
episode 792.000000, reward total was -19.000000. running mean: -20.111851
episode 793.000000, reward total was -

episode 891.000000, reward total was -20.000000. running mean: -20.193812
episode 892.000000, reward total was -20.000000. running mean: -20.191874
episode 893.000000, reward total was -21.000000. running mean: -20.199955
episode 894.000000, reward total was -16.000000. running mean: -20.157956
episode 895.000000, reward total was -20.000000. running mean: -20.156376
episode 896.000000, reward total was -18.000000. running mean: -20.134812
episode 897.000000, reward total was -20.000000. running mean: -20.133464
episode 898.000000, reward total was -20.000000. running mean: -20.132129
episode 899.000000, reward total was -20.000000. running mean: -20.130808
episode 900.000000, reward total was -18.000000. running mean: -20.109500
episode 901.000000, reward total was -21.000000. running mean: -20.118405
episode 902.000000, reward total was -21.000000. running mean: -20.127221
episode 903.000000, reward total was -20.000000. running mean: -20.125949
episode 904.000000, reward total was -

episode 1002.000000, reward total was -20.000000. running mean: -20.109503
episode 1003.000000, reward total was -17.000000. running mean: -20.078408
episode 1004.000000, reward total was -20.000000. running mean: -20.077624
episode 1005.000000, reward total was -20.000000. running mean: -20.076847
episode 1006.000000, reward total was -21.000000. running mean: -20.086079
episode 1007.000000, reward total was -21.000000. running mean: -20.095218
episode 1008.000000, reward total was -21.000000. running mean: -20.104266
episode 1009.000000, reward total was -20.000000. running mean: -20.103223
episode 1010.000000, reward total was -20.000000. running mean: -20.102191
episode 1011.000000, reward total was -20.000000. running mean: -20.101169
episode 1012.000000, reward total was -20.000000. running mean: -20.100158
episode 1013.000000, reward total was -19.000000. running mean: -20.089156
episode 1014.000000, reward total was -21.000000. running mean: -20.098264
episode 1015.000000, rewa

episode 1112.000000, reward total was -21.000000. running mean: -20.160983
episode 1113.000000, reward total was -20.000000. running mean: -20.159374
episode 1114.000000, reward total was -21.000000. running mean: -20.167780
episode 1115.000000, reward total was -21.000000. running mean: -20.176102
episode 1116.000000, reward total was -21.000000. running mean: -20.184341
episode 1117.000000, reward total was -21.000000. running mean: -20.192498
episode 1118.000000, reward total was -19.000000. running mean: -20.180573
episode 1119.000000, reward total was -19.000000. running mean: -20.168767
episode 1120.000000, reward total was -19.000000. running mean: -20.157079
episode 1121.000000, reward total was -21.000000. running mean: -20.165508
episode 1122.000000, reward total was -18.000000. running mean: -20.143853
episode 1123.000000, reward total was -18.000000. running mean: -20.122415
episode 1124.000000, reward total was -20.000000. running mean: -20.121191
episode 1125.000000, rewa

episode 1222.000000, reward total was -21.000000. running mean: -20.105424
episode 1223.000000, reward total was -21.000000. running mean: -20.114370
episode 1224.000000, reward total was -20.000000. running mean: -20.113226
episode 1225.000000, reward total was -20.000000. running mean: -20.112094
episode 1226.000000, reward total was -20.000000. running mean: -20.110973
episode 1227.000000, reward total was -21.000000. running mean: -20.119863
episode 1228.000000, reward total was -20.000000. running mean: -20.118665
episode 1229.000000, reward total was -20.000000. running mean: -20.117478
episode 1230.000000, reward total was -20.000000. running mean: -20.116303
episode 1231.000000, reward total was -21.000000. running mean: -20.125140
episode 1232.000000, reward total was -19.000000. running mean: -20.113889
episode 1233.000000, reward total was -18.000000. running mean: -20.092750
episode 1234.000000, reward total was -20.000000. running mean: -20.091823
episode 1235.000000, rewa

episode 1332.000000, reward total was -21.000000. running mean: -20.134224
episode 1333.000000, reward total was -21.000000. running mean: -20.142882
episode 1334.000000, reward total was -18.000000. running mean: -20.121453
episode 1335.000000, reward total was -21.000000. running mean: -20.130239
episode 1336.000000, reward total was -21.000000. running mean: -20.138936
episode 1337.000000, reward total was -20.000000. running mean: -20.137547
episode 1338.000000, reward total was -21.000000. running mean: -20.146172
episode 1339.000000, reward total was -21.000000. running mean: -20.154710
episode 1340.000000, reward total was -20.000000. running mean: -20.153163
episode 1341.000000, reward total was -20.000000. running mean: -20.151631
episode 1342.000000, reward total was -20.000000. running mean: -20.150115
episode 1343.000000, reward total was -21.000000. running mean: -20.158614
episode 1344.000000, reward total was -21.000000. running mean: -20.167028
episode 1345.000000, rewa

episode 1442.000000, reward total was -20.000000. running mean: -20.057933
episode 1443.000000, reward total was -19.000000. running mean: -20.047354
episode 1444.000000, reward total was -19.000000. running mean: -20.036880
episode 1445.000000, reward total was -20.000000. running mean: -20.036511
episode 1446.000000, reward total was -19.000000. running mean: -20.026146
episode 1447.000000, reward total was -19.000000. running mean: -20.015885
episode 1448.000000, reward total was -19.000000. running mean: -20.005726
episode 1449.000000, reward total was -21.000000. running mean: -20.015669
episode 1450.000000, reward total was -18.000000. running mean: -19.995512
episode 1451.000000, reward total was -21.000000. running mean: -20.005557
episode 1452.000000, reward total was -20.000000. running mean: -20.005501
episode 1453.000000, reward total was -21.000000. running mean: -20.015446
episode 1454.000000, reward total was -19.000000. running mean: -20.005292
episode 1455.000000, rewa

episode 1552.000000, reward total was -21.000000. running mean: -19.940240
episode 1553.000000, reward total was -20.000000. running mean: -19.940837
episode 1554.000000, reward total was -18.000000. running mean: -19.921429
episode 1555.000000, reward total was -20.000000. running mean: -19.922214
episode 1556.000000, reward total was -21.000000. running mean: -19.932992
episode 1557.000000, reward total was -18.000000. running mean: -19.913662
episode 1558.000000, reward total was -21.000000. running mean: -19.924526
episode 1559.000000, reward total was -20.000000. running mean: -19.925281
episode 1560.000000, reward total was -21.000000. running mean: -19.936028
episode 1561.000000, reward total was -21.000000. running mean: -19.946667
episode 1562.000000, reward total was -20.000000. running mean: -19.947201
episode 1563.000000, reward total was -19.000000. running mean: -19.937729
episode 1564.000000, reward total was -20.000000. running mean: -19.938351
episode 1565.000000, rewa

episode 1662.000000, reward total was -20.000000. running mean: -19.797386
episode 1663.000000, reward total was -20.000000. running mean: -19.799413
episode 1664.000000, reward total was -18.000000. running mean: -19.781418
episode 1665.000000, reward total was -21.000000. running mean: -19.793604
episode 1666.000000, reward total was -21.000000. running mean: -19.805668
episode 1667.000000, reward total was -21.000000. running mean: -19.817612
episode 1668.000000, reward total was -20.000000. running mean: -19.819435
episode 1669.000000, reward total was -21.000000. running mean: -19.831241
episode 1670.000000, reward total was -20.000000. running mean: -19.832929
episode 1671.000000, reward total was -21.000000. running mean: -19.844599
episode 1672.000000, reward total was -21.000000. running mean: -19.856153
episode 1673.000000, reward total was -19.000000. running mean: -19.847592
episode 1674.000000, reward total was -19.000000. running mean: -19.839116
episode 1675.000000, rewa

episode 1772.000000, reward total was -21.000000. running mean: -19.741835
episode 1773.000000, reward total was -20.000000. running mean: -19.744417
episode 1774.000000, reward total was -20.000000. running mean: -19.746972
episode 1775.000000, reward total was -21.000000. running mean: -19.759503
episode 1776.000000, reward total was -21.000000. running mean: -19.771908
episode 1777.000000, reward total was -19.000000. running mean: -19.764189
episode 1778.000000, reward total was -20.000000. running mean: -19.766547
episode 1779.000000, reward total was -19.000000. running mean: -19.758881
episode 1780.000000, reward total was -20.000000. running mean: -19.761292
episode 1781.000000, reward total was -20.000000. running mean: -19.763679
episode 1782.000000, reward total was -21.000000. running mean: -19.776043
episode 1783.000000, reward total was -21.000000. running mean: -19.788282
episode 1784.000000, reward total was -20.000000. running mean: -19.790399
episode 1785.000000, rewa

episode 1882.000000, reward total was -19.000000. running mean: -19.739448
episode 1883.000000, reward total was -17.000000. running mean: -19.712054
episode 1884.000000, reward total was -19.000000. running mean: -19.704933
episode 1885.000000, reward total was -20.000000. running mean: -19.707884
episode 1886.000000, reward total was -21.000000. running mean: -19.720805
episode 1887.000000, reward total was -18.000000. running mean: -19.703597
episode 1888.000000, reward total was -19.000000. running mean: -19.696561
episode 1889.000000, reward total was -21.000000. running mean: -19.709595
episode 1890.000000, reward total was -21.000000. running mean: -19.722499
episode 1891.000000, reward total was -20.000000. running mean: -19.725274
episode 1892.000000, reward total was -18.000000. running mean: -19.708022
episode 1893.000000, reward total was -20.000000. running mean: -19.710942
episode 1894.000000, reward total was -18.000000. running mean: -19.693832
episode 1895.000000, rewa

episode 1992.000000, reward total was -21.000000. running mean: -19.779613
episode 1993.000000, reward total was -18.000000. running mean: -19.761817
episode 1994.000000, reward total was -21.000000. running mean: -19.774199
episode 1995.000000, reward total was -18.000000. running mean: -19.756457
episode 1996.000000, reward total was -20.000000. running mean: -19.758892
episode 1997.000000, reward total was -16.000000. running mean: -19.721303
episode 1998.000000, reward total was -18.000000. running mean: -19.704090
episode 1999.000000, reward total was -21.000000. running mean: -19.717049
episode 2000.000000, reward total was -20.000000. running mean: -19.719879
episode 2001.000000, reward total was -18.000000. running mean: -19.702680
episode 2002.000000, reward total was -21.000000. running mean: -19.715653
episode 2003.000000, reward total was -21.000000. running mean: -19.728497
episode 2004.000000, reward total was -18.000000. running mean: -19.711212
episode 2005.000000, rewa

episode 2102.000000, reward total was -17.000000. running mean: -19.766678
episode 2103.000000, reward total was -20.000000. running mean: -19.769011
episode 2104.000000, reward total was -20.000000. running mean: -19.771321
episode 2105.000000, reward total was -20.000000. running mean: -19.773608
episode 2106.000000, reward total was -20.000000. running mean: -19.775872
episode 2107.000000, reward total was -21.000000. running mean: -19.788113
episode 2108.000000, reward total was -19.000000. running mean: -19.780232
episode 2109.000000, reward total was -18.000000. running mean: -19.762429
episode 2110.000000, reward total was -18.000000. running mean: -19.744805
episode 2111.000000, reward total was -19.000000. running mean: -19.737357
episode 2112.000000, reward total was -19.000000. running mean: -19.729984
episode 2113.000000, reward total was -20.000000. running mean: -19.732684
episode 2114.000000, reward total was -20.000000. running mean: -19.735357
episode 2115.000000, rewa

episode 2212.000000, reward total was -21.000000. running mean: -19.329072
episode 2213.000000, reward total was -21.000000. running mean: -19.345781
episode 2214.000000, reward total was -21.000000. running mean: -19.362323
episode 2215.000000, reward total was -21.000000. running mean: -19.378700
episode 2216.000000, reward total was -20.000000. running mean: -19.384913
episode 2217.000000, reward total was -21.000000. running mean: -19.401064
episode 2218.000000, reward total was -21.000000. running mean: -19.417053
episode 2219.000000, reward total was -20.000000. running mean: -19.422883
episode 2220.000000, reward total was -20.000000. running mean: -19.428654
episode 2221.000000, reward total was -19.000000. running mean: -19.424367
episode 2222.000000, reward total was -19.000000. running mean: -19.420124
episode 2223.000000, reward total was -21.000000. running mean: -19.435922
episode 2224.000000, reward total was -21.000000. running mean: -19.451563
episode 2225.000000, rewa

episode 2322.000000, reward total was -19.000000. running mean: -19.484988
episode 2323.000000, reward total was -20.000000. running mean: -19.490138
episode 2324.000000, reward total was -20.000000. running mean: -19.495237
episode 2325.000000, reward total was -20.000000. running mean: -19.500284
episode 2326.000000, reward total was -19.000000. running mean: -19.495281
episode 2327.000000, reward total was -21.000000. running mean: -19.510329
episode 2328.000000, reward total was -20.000000. running mean: -19.515225
episode 2329.000000, reward total was -20.000000. running mean: -19.520073
episode 2330.000000, reward total was -19.000000. running mean: -19.514872
episode 2331.000000, reward total was -20.000000. running mean: -19.519724
episode 2332.000000, reward total was -19.000000. running mean: -19.514526
episode 2333.000000, reward total was -21.000000. running mean: -19.529381
episode 2334.000000, reward total was -20.000000. running mean: -19.534087
episode 2335.000000, rewa

episode 2432.000000, reward total was -17.000000. running mean: -19.442627
episode 2433.000000, reward total was -19.000000. running mean: -19.438201
episode 2434.000000, reward total was -18.000000. running mean: -19.423819
episode 2435.000000, reward total was -20.000000. running mean: -19.429580
episode 2436.000000, reward total was -19.000000. running mean: -19.425285
episode 2437.000000, reward total was -21.000000. running mean: -19.441032
episode 2438.000000, reward total was -21.000000. running mean: -19.456621
episode 2439.000000, reward total was -21.000000. running mean: -19.472055
episode 2440.000000, reward total was -21.000000. running mean: -19.487335
episode 2441.000000, reward total was -18.000000. running mean: -19.472461
episode 2442.000000, reward total was -18.000000. running mean: -19.457737
episode 2443.000000, reward total was -20.000000. running mean: -19.463159
episode 2444.000000, reward total was -20.000000. running mean: -19.468528
episode 2445.000000, rewa

episode 2542.000000, reward total was -21.000000. running mean: -19.415636
episode 2543.000000, reward total was -20.000000. running mean: -19.421480
episode 2544.000000, reward total was -21.000000. running mean: -19.437265
episode 2545.000000, reward total was -21.000000. running mean: -19.452892
episode 2546.000000, reward total was -21.000000. running mean: -19.468363
episode 2547.000000, reward total was -19.000000. running mean: -19.463680
episode 2548.000000, reward total was -21.000000. running mean: -19.479043
episode 2549.000000, reward total was -18.000000. running mean: -19.464253
episode 2550.000000, reward total was -20.000000. running mean: -19.469610
episode 2551.000000, reward total was -18.000000. running mean: -19.454914
episode 2552.000000, reward total was -17.000000. running mean: -19.430365
episode 2553.000000, reward total was -17.000000. running mean: -19.406061
episode 2554.000000, reward total was -18.000000. running mean: -19.392001
episode 2555.000000, rewa

episode 2652.000000, reward total was -20.000000. running mean: -19.250516
episode 2653.000000, reward total was -20.000000. running mean: -19.258011
episode 2654.000000, reward total was -18.000000. running mean: -19.245431
episode 2655.000000, reward total was -16.000000. running mean: -19.212977
episode 2656.000000, reward total was -18.000000. running mean: -19.200847
episode 2657.000000, reward total was -20.000000. running mean: -19.208839
episode 2658.000000, reward total was -20.000000. running mean: -19.216750
episode 2659.000000, reward total was -21.000000. running mean: -19.234583
episode 2660.000000, reward total was -20.000000. running mean: -19.242237
episode 2661.000000, reward total was -19.000000. running mean: -19.239815
episode 2662.000000, reward total was -20.000000. running mean: -19.247416
episode 2663.000000, reward total was -20.000000. running mean: -19.254942
episode 2664.000000, reward total was -20.000000. running mean: -19.262393
episode 2665.000000, rewa

episode 2762.000000, reward total was -18.000000. running mean: -19.218442
episode 2763.000000, reward total was -21.000000. running mean: -19.236257
episode 2764.000000, reward total was -19.000000. running mean: -19.233895
episode 2765.000000, reward total was -19.000000. running mean: -19.231556
episode 2766.000000, reward total was -20.000000. running mean: -19.239240
episode 2767.000000, reward total was -20.000000. running mean: -19.246848
episode 2768.000000, reward total was -19.000000. running mean: -19.244379
episode 2769.000000, reward total was -19.000000. running mean: -19.241936
episode 2770.000000, reward total was -19.000000. running mean: -19.239516
episode 2771.000000, reward total was -21.000000. running mean: -19.257121
episode 2772.000000, reward total was -18.000000. running mean: -19.244550
episode 2773.000000, reward total was -18.000000. running mean: -19.232104
episode 2774.000000, reward total was -20.000000. running mean: -19.239783
episode 2775.000000, rewa

episode 2872.000000, reward total was -17.000000. running mean: -19.182316
episode 2873.000000, reward total was -17.000000. running mean: -19.160493
episode 2874.000000, reward total was -19.000000. running mean: -19.158888
episode 2875.000000, reward total was -21.000000. running mean: -19.177299
episode 2876.000000, reward total was -19.000000. running mean: -19.175526
episode 2877.000000, reward total was -15.000000. running mean: -19.133771
episode 2878.000000, reward total was -20.000000. running mean: -19.142433
episode 2879.000000, reward total was -20.000000. running mean: -19.151009
episode 2880.000000, reward total was -20.000000. running mean: -19.159499
episode 2881.000000, reward total was -19.000000. running mean: -19.157904
episode 2882.000000, reward total was -20.000000. running mean: -19.166325
episode 2883.000000, reward total was -16.000000. running mean: -19.134662
episode 2884.000000, reward total was -19.000000. running mean: -19.133315
episode 2885.000000, rewa

episode 2982.000000, reward total was -18.000000. running mean: -19.085171
episode 2983.000000, reward total was -20.000000. running mean: -19.094320
episode 2984.000000, reward total was -19.000000. running mean: -19.093376
episode 2985.000000, reward total was -20.000000. running mean: -19.102443
episode 2986.000000, reward total was -18.000000. running mean: -19.091418
episode 2987.000000, reward total was -18.000000. running mean: -19.080504
episode 2988.000000, reward total was -20.000000. running mean: -19.089699
episode 2989.000000, reward total was -20.000000. running mean: -19.098802
episode 2990.000000, reward total was -20.000000. running mean: -19.107814
episode 2991.000000, reward total was -18.000000. running mean: -19.096736
episode 2992.000000, reward total was -19.000000. running mean: -19.095769
episode 2993.000000, reward total was -19.000000. running mean: -19.094811
episode 2994.000000, reward total was -21.000000. running mean: -19.113863
episode 2995.000000, rewa

episode 3092.000000, reward total was -19.000000. running mean: -19.090829
episode 3093.000000, reward total was -19.000000. running mean: -19.089920
episode 3094.000000, reward total was -19.000000. running mean: -19.089021
episode 3095.000000, reward total was -20.000000. running mean: -19.098131
episode 3096.000000, reward total was -16.000000. running mean: -19.067150
episode 3097.000000, reward total was -19.000000. running mean: -19.066478
episode 3098.000000, reward total was -19.000000. running mean: -19.065813
episode 3099.000000, reward total was -19.000000. running mean: -19.065155
episode 3100.000000, reward total was -20.000000. running mean: -19.074504
episode 3101.000000, reward total was -18.000000. running mean: -19.063759
episode 3102.000000, reward total was -18.000000. running mean: -19.053121
episode 3103.000000, reward total was -19.000000. running mean: -19.052590
episode 3104.000000, reward total was -19.000000. running mean: -19.052064
episode 3105.000000, rewa

episode 3202.000000, reward total was -19.000000. running mean: -19.024557
episode 3203.000000, reward total was -18.000000. running mean: -19.014312
episode 3204.000000, reward total was -19.000000. running mean: -19.014169
episode 3205.000000, reward total was -19.000000. running mean: -19.014027
episode 3206.000000, reward total was -16.000000. running mean: -18.983887
episode 3207.000000, reward total was -17.000000. running mean: -18.964048
episode 3208.000000, reward total was -16.000000. running mean: -18.934407
episode 3209.000000, reward total was -17.000000. running mean: -18.915063
episode 3210.000000, reward total was -21.000000. running mean: -18.935913
episode 3211.000000, reward total was -18.000000. running mean: -18.926554
episode 3212.000000, reward total was -21.000000. running mean: -18.947288
episode 3213.000000, reward total was -20.000000. running mean: -18.957815
episode 3214.000000, reward total was -19.000000. running mean: -18.958237
episode 3215.000000, rewa

episode 3312.000000, reward total was -20.000000. running mean: -18.823456
episode 3313.000000, reward total was -18.000000. running mean: -18.815221
episode 3314.000000, reward total was -19.000000. running mean: -18.817069
episode 3315.000000, reward total was -15.000000. running mean: -18.778898
episode 3316.000000, reward total was -20.000000. running mean: -18.791109
episode 3317.000000, reward total was -19.000000. running mean: -18.793198
episode 3318.000000, reward total was -20.000000. running mean: -18.805266
episode 3319.000000, reward total was -21.000000. running mean: -18.827213
episode 3320.000000, reward total was -20.000000. running mean: -18.838941
episode 3321.000000, reward total was -20.000000. running mean: -18.850552
episode 3322.000000, reward total was -17.000000. running mean: -18.832046
episode 3323.000000, reward total was -19.000000. running mean: -18.833726
episode 3324.000000, reward total was -19.000000. running mean: -18.835389
episode 3325.000000, rewa

episode 3422.000000, reward total was -18.000000. running mean: -18.961364
episode 3423.000000, reward total was -18.000000. running mean: -18.951751
episode 3424.000000, reward total was -20.000000. running mean: -18.962233
episode 3425.000000, reward total was -20.000000. running mean: -18.972611
episode 3426.000000, reward total was -14.000000. running mean: -18.922885
episode 3427.000000, reward total was -17.000000. running mean: -18.903656
episode 3428.000000, reward total was -17.000000. running mean: -18.884619
episode 3429.000000, reward total was -20.000000. running mean: -18.895773
episode 3430.000000, reward total was -18.000000. running mean: -18.886816
episode 3431.000000, reward total was -19.000000. running mean: -18.887947
episode 3432.000000, reward total was -21.000000. running mean: -18.909068
episode 3433.000000, reward total was -19.000000. running mean: -18.909977
episode 3434.000000, reward total was -18.000000. running mean: -18.900877
episode 3435.000000, rewa

episode 3532.000000, reward total was -19.000000. running mean: -18.729454
episode 3533.000000, reward total was -21.000000. running mean: -18.752159
episode 3534.000000, reward total was -19.000000. running mean: -18.754638
episode 3535.000000, reward total was -20.000000. running mean: -18.767091
episode 3536.000000, reward total was -19.000000. running mean: -18.769420
episode 3537.000000, reward total was -17.000000. running mean: -18.751726
episode 3538.000000, reward total was -19.000000. running mean: -18.754209
episode 3539.000000, reward total was -17.000000. running mean: -18.736667
episode 3540.000000, reward total was -19.000000. running mean: -18.739300
episode 3541.000000, reward total was -18.000000. running mean: -18.731907
episode 3542.000000, reward total was -21.000000. running mean: -18.754588
episode 3543.000000, reward total was -20.000000. running mean: -18.767042
episode 3544.000000, reward total was -19.000000. running mean: -18.769372
episode 3545.000000, rewa

episode 3642.000000, reward total was -17.000000. running mean: -18.452616
episode 3643.000000, reward total was -17.000000. running mean: -18.438089
episode 3644.000000, reward total was -19.000000. running mean: -18.443709
episode 3645.000000, reward total was -19.000000. running mean: -18.449271
episode 3646.000000, reward total was -18.000000. running mean: -18.444779
episode 3647.000000, reward total was -16.000000. running mean: -18.420331
episode 3648.000000, reward total was -16.000000. running mean: -18.396128
episode 3649.000000, reward total was -21.000000. running mean: -18.422166
episode 3650.000000, reward total was -19.000000. running mean: -18.427945
episode 3651.000000, reward total was -18.000000. running mean: -18.423665
episode 3652.000000, reward total was -21.000000. running mean: -18.449429
episode 3653.000000, reward total was -20.000000. running mean: -18.464934
episode 3654.000000, reward total was -20.000000. running mean: -18.480285
episode 3655.000000, rewa

episode 3752.000000, reward total was -18.000000. running mean: -18.505626
episode 3753.000000, reward total was -19.000000. running mean: -18.510570
episode 3754.000000, reward total was -18.000000. running mean: -18.505464
episode 3755.000000, reward total was -18.000000. running mean: -18.500410
episode 3756.000000, reward total was -17.000000. running mean: -18.485406
episode 3757.000000, reward total was -18.000000. running mean: -18.480552
episode 3758.000000, reward total was -18.000000. running mean: -18.475746
episode 3759.000000, reward total was -19.000000. running mean: -18.480989
episode 3760.000000, reward total was -17.000000. running mean: -18.466179
episode 3761.000000, reward total was -20.000000. running mean: -18.481517
episode 3762.000000, reward total was -19.000000. running mean: -18.486702
episode 3763.000000, reward total was -16.000000. running mean: -18.461835
episode 3764.000000, reward total was -19.000000. running mean: -18.467216
episode 3765.000000, rewa

episode 3862.000000, reward total was -19.000000. running mean: -18.339371
episode 3863.000000, reward total was -12.000000. running mean: -18.275977
episode 3864.000000, reward total was -20.000000. running mean: -18.293217
episode 3865.000000, reward total was -19.000000. running mean: -18.300285
episode 3866.000000, reward total was -18.000000. running mean: -18.297282
episode 3867.000000, reward total was -18.000000. running mean: -18.294310
episode 3868.000000, reward total was -17.000000. running mean: -18.281367
episode 3869.000000, reward total was -19.000000. running mean: -18.288553
episode 3870.000000, reward total was -19.000000. running mean: -18.295667
episode 3871.000000, reward total was -17.000000. running mean: -18.282711
episode 3872.000000, reward total was -17.000000. running mean: -18.269884
episode 3873.000000, reward total was -16.000000. running mean: -18.247185
episode 3874.000000, reward total was -16.000000. running mean: -18.224713
episode 3875.000000, rewa

episode 3972.000000, reward total was -21.000000. running mean: -18.176259
episode 3973.000000, reward total was -18.000000. running mean: -18.174496
episode 3974.000000, reward total was -18.000000. running mean: -18.172751
episode 3975.000000, reward total was -17.000000. running mean: -18.161024
episode 3976.000000, reward total was -16.000000. running mean: -18.139414
episode 3977.000000, reward total was -17.000000. running mean: -18.128019
episode 3978.000000, reward total was -17.000000. running mean: -18.116739
episode 3979.000000, reward total was -19.000000. running mean: -18.125572
episode 3980.000000, reward total was -18.000000. running mean: -18.124316
episode 3981.000000, reward total was -20.000000. running mean: -18.143073
episode 3982.000000, reward total was -18.000000. running mean: -18.141642
episode 3983.000000, reward total was -17.000000. running mean: -18.130226
episode 3984.000000, reward total was -19.000000. running mean: -18.138924
episode 3985.000000, rewa

episode 4082.000000, reward total was -19.000000. running mean: -18.006203
episode 4083.000000, reward total was -20.000000. running mean: -18.026141
episode 4084.000000, reward total was -18.000000. running mean: -18.025879
episode 4085.000000, reward total was -18.000000. running mean: -18.025621
episode 4086.000000, reward total was -18.000000. running mean: -18.025364
episode 4087.000000, reward total was -20.000000. running mean: -18.045111
episode 4088.000000, reward total was -17.000000. running mean: -18.034660
episode 4089.000000, reward total was -17.000000. running mean: -18.024313
episode 4090.000000, reward total was -19.000000. running mean: -18.034070
episode 4091.000000, reward total was -16.000000. running mean: -18.013729
episode 4092.000000, reward total was -19.000000. running mean: -18.023592
episode 4093.000000, reward total was -14.000000. running mean: -17.983356
episode 4094.000000, reward total was -18.000000. running mean: -17.983522
episode 4095.000000, rewa

episode 4192.000000, reward total was -19.000000. running mean: -17.810897
episode 4193.000000, reward total was -19.000000. running mean: -17.822788
episode 4194.000000, reward total was -19.000000. running mean: -17.834560
episode 4195.000000, reward total was -19.000000. running mean: -17.846214
episode 4196.000000, reward total was -20.000000. running mean: -17.867752
episode 4197.000000, reward total was -18.000000. running mean: -17.869075
episode 4198.000000, reward total was -17.000000. running mean: -17.860384
episode 4199.000000, reward total was -17.000000. running mean: -17.851780
episode 4200.000000, reward total was -21.000000. running mean: -17.883262
episode 4201.000000, reward total was -20.000000. running mean: -17.904430
episode 4202.000000, reward total was -18.000000. running mean: -17.905385
episode 4203.000000, reward total was -20.000000. running mean: -17.926332
episode 4204.000000, reward total was -18.000000. running mean: -17.927068
episode 4205.000000, rewa

episode 4302.000000, reward total was -17.000000. running mean: -17.860717
episode 4303.000000, reward total was -17.000000. running mean: -17.852110
episode 4304.000000, reward total was -16.000000. running mean: -17.833589
episode 4305.000000, reward total was -18.000000. running mean: -17.835253
episode 4306.000000, reward total was -20.000000. running mean: -17.856900
episode 4307.000000, reward total was -21.000000. running mean: -17.888331
episode 4308.000000, reward total was -15.000000. running mean: -17.859448
episode 4309.000000, reward total was -15.000000. running mean: -17.830853
episode 4310.000000, reward total was -17.000000. running mean: -17.822545
episode 4311.000000, reward total was -16.000000. running mean: -17.804319
episode 4312.000000, reward total was -19.000000. running mean: -17.816276
episode 4313.000000, reward total was -17.000000. running mean: -17.808113
episode 4314.000000, reward total was -17.000000. running mean: -17.800032
episode 4315.000000, rewa

episode 4412.000000, reward total was -15.000000. running mean: -17.579248
episode 4413.000000, reward total was -16.000000. running mean: -17.563455
episode 4414.000000, reward total was -18.000000. running mean: -17.567821
episode 4415.000000, reward total was -18.000000. running mean: -17.572143
episode 4416.000000, reward total was -19.000000. running mean: -17.586421
episode 4417.000000, reward total was -19.000000. running mean: -17.600557
episode 4418.000000, reward total was -17.000000. running mean: -17.594552
episode 4419.000000, reward total was -21.000000. running mean: -17.628606
episode 4420.000000, reward total was -19.000000. running mean: -17.642320
episode 4421.000000, reward total was -20.000000. running mean: -17.665897
episode 4422.000000, reward total was -14.000000. running mean: -17.629238
episode 4423.000000, reward total was -20.000000. running mean: -17.652945
episode 4424.000000, reward total was -21.000000. running mean: -17.686416
episode 4425.000000, rewa

episode 4522.000000, reward total was -16.000000. running mean: -17.261113
episode 4523.000000, reward total was -18.000000. running mean: -17.268502
episode 4524.000000, reward total was -14.000000. running mean: -17.235817
episode 4525.000000, reward total was -14.000000. running mean: -17.203458
episode 4526.000000, reward total was -21.000000. running mean: -17.241424
episode 4527.000000, reward total was -18.000000. running mean: -17.249010
episode 4528.000000, reward total was -13.000000. running mean: -17.206519
episode 4529.000000, reward total was -19.000000. running mean: -17.224454
episode 4530.000000, reward total was -19.000000. running mean: -17.242210
episode 4531.000000, reward total was -15.000000. running mean: -17.219788
episode 4532.000000, reward total was -16.000000. running mean: -17.207590
episode 4533.000000, reward total was -20.000000. running mean: -17.235514
episode 4534.000000, reward total was -16.000000. running mean: -17.223159
episode 4535.000000, rewa

episode 4632.000000, reward total was -17.000000. running mean: -17.007743
episode 4633.000000, reward total was -15.000000. running mean: -16.987666
episode 4634.000000, reward total was -21.000000. running mean: -17.027789
episode 4635.000000, reward total was -16.000000. running mean: -17.017511
episode 4636.000000, reward total was -17.000000. running mean: -17.017336
episode 4637.000000, reward total was -19.000000. running mean: -17.037163
episode 4638.000000, reward total was -18.000000. running mean: -17.046791
episode 4639.000000, reward total was -19.000000. running mean: -17.066323
episode 4640.000000, reward total was -18.000000. running mean: -17.075660
episode 4641.000000, reward total was -18.000000. running mean: -17.084904
episode 4642.000000, reward total was -16.000000. running mean: -17.074055
episode 4643.000000, reward total was -17.000000. running mean: -17.073314
episode 4644.000000, reward total was -19.000000. running mean: -17.092581
episode 4645.000000, rewa

episode 4742.000000, reward total was -18.000000. running mean: -17.160002
episode 4743.000000, reward total was -16.000000. running mean: -17.148402
episode 4744.000000, reward total was -19.000000. running mean: -17.166918
episode 4745.000000, reward total was -17.000000. running mean: -17.165249
episode 4746.000000, reward total was -15.000000. running mean: -17.143596
episode 4747.000000, reward total was -14.000000. running mean: -17.112160
episode 4748.000000, reward total was -18.000000. running mean: -17.121039
episode 4749.000000, reward total was -17.000000. running mean: -17.119828
episode 4750.000000, reward total was -13.000000. running mean: -17.078630
episode 4751.000000, reward total was -13.000000. running mean: -17.037844
episode 4752.000000, reward total was -17.000000. running mean: -17.037465
episode 4753.000000, reward total was -14.000000. running mean: -17.007091
episode 4754.000000, reward total was -16.000000. running mean: -16.997020
episode 4755.000000, rewa

episode 4852.000000, reward total was -13.000000. running mean: -16.899086
episode 4853.000000, reward total was -20.000000. running mean: -16.930095
episode 4854.000000, reward total was -18.000000. running mean: -16.940794
episode 4855.000000, reward total was -17.000000. running mean: -16.941386
episode 4856.000000, reward total was -17.000000. running mean: -16.941972
episode 4857.000000, reward total was -20.000000. running mean: -16.972552
episode 4858.000000, reward total was -13.000000. running mean: -16.932827
episode 4859.000000, reward total was -18.000000. running mean: -16.943499
episode 4860.000000, reward total was -18.000000. running mean: -16.954064
episode 4861.000000, reward total was -14.000000. running mean: -16.924523
episode 4862.000000, reward total was -18.000000. running mean: -16.935278
episode 4863.000000, reward total was -17.000000. running mean: -16.935925
episode 4864.000000, reward total was -14.000000. running mean: -16.906566
episode 4865.000000, rewa

episode 4962.000000, reward total was -21.000000. running mean: -16.964177
episode 4963.000000, reward total was -17.000000. running mean: -16.964536
episode 4964.000000, reward total was -13.000000. running mean: -16.924890
episode 4965.000000, reward total was -19.000000. running mean: -16.945641
episode 4966.000000, reward total was -16.000000. running mean: -16.936185
episode 4967.000000, reward total was -17.000000. running mean: -16.936823
episode 4968.000000, reward total was -16.000000. running mean: -16.927455
episode 4969.000000, reward total was -14.000000. running mean: -16.898180
episode 4970.000000, reward total was -19.000000. running mean: -16.919198
episode 4971.000000, reward total was -15.000000. running mean: -16.900006
episode 4972.000000, reward total was -18.000000. running mean: -16.911006
episode 4973.000000, reward total was -17.000000. running mean: -16.911896
episode 4974.000000, reward total was -18.000000. running mean: -16.922777
episode 4975.000000, rewa

episode 5072.000000, reward total was -15.000000. running mean: -16.790737
episode 5073.000000, reward total was -16.000000. running mean: -16.782829
episode 5074.000000, reward total was -14.000000. running mean: -16.755001
episode 5075.000000, reward total was -11.000000. running mean: -16.697451
episode 5076.000000, reward total was -17.000000. running mean: -16.700477
episode 5077.000000, reward total was -16.000000. running mean: -16.693472
episode 5078.000000, reward total was -18.000000. running mean: -16.706537
episode 5079.000000, reward total was -19.000000. running mean: -16.729472
episode 5080.000000, reward total was -17.000000. running mean: -16.732177
