In [1]:
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import gym

# --- training hyperparameters -------------------------------------------------
H = 200               # width of the single hidden layer
batch_size = 10       # number of episodes accumulated before each parameter update
learning_rate = 1e-4  # step size used by the RMSProp update
gamma = 0.99          # reward discount factor
decay_rate = 0.99     # RMSProp decay for the leaky running sum of grad^2
resume = False        # flag: resume from a previous checkpoint?
render = True         # flag: call env.render() on every step of the training loop

# --- network parameters -------------------------------------------------------
D = 80 * 80  # size of the flattened 80x80 input grid

# Two weight tensors, each scaled by 1/sqrt(fan_in) ("Xavier" initialization).
# W1 is drawn before W2 so the random stream is consumed in a fixed order.
model = {
    'W1': np.random.randn(H, D) / np.sqrt(D),
    'W2': np.random.randn(H) / np.sqrt(H),
}

# Show both shapes, one per line.
print(model['W1'].shape, model['W2'].shape, sep='\n')

# Per-parameter accumulators, zero-initialised with the same shape/dtype as the weights.
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}    # gradients summed over a batch
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}  # RMSProp running grad^2

def sigmoid(x):
    """Logistic function 1 / (1 + e^-x); maps real inputs into the interval [0, 1].

    Works element-wise on NumPy arrays and also accepts plain scalars.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a flat 6400-element (80x80) float vector.

    Steps:
      1. crop to rows 35..194,
      2. keep every second row and column of colour channel 0,
      3. map the two background values (144 and 109) and 0 to 0.0,
         and every other pixel value (paddles, ball) to 1.0.

    Args:
        I: array-like frame indexable as ``I[row, col, channel]``.

    Returns:
        1-D ``float64`` array of length 6400 containing only 0.0 and 1.0.

    The input frame is never modified: the mask is built from comparisons
    instead of writing through a view of the caller's array. The result dtype
    is spelled ``np.float64`` because the ``np.float`` alias no longer exists in
    current NumPy releases.
    """
    # Crop and 2x downsample in a single slice; this is a read-only use of the view.
    field = np.asarray(I)[35:195:2, ::2, 0]
    # Foreground = pixel is neither already 0 nor one of the two background values.
    foreground = (field != 0) & (field != 144) & (field != 109)
    return foreground.astype(np.float64).ravel()

def discount_rewards(r):
    """Compute discounted returns for an array of per-step rewards.

    Walks the rewards from the last step to the first, carrying a running
    return that is multiplied by the module-level ``gamma`` at every step.
    Whenever a non-zero reward is met the running return is cleared first,
    because in Pong a non-zero reward marks a game boundary.

    Args:
        r: reward array; it is indexed as ``r[t]`` for ``t`` in ``range(r.size)``.

    Returns:
        A new array shaped and typed like ``r`` holding the discounted returns.
    """
    returns = np.zeros_like(r)
    accumulated = 0
    for idx in range(r.size - 1, -1, -1):
        reward = r[idx]
        if reward != 0:
            # game boundary (Pong specific): do not leak return across points
            accumulated = 0
        accumulated = accumulated * gamma + reward
        returns[idx] = accumulated
    return returns

def policy_forward(x):
    """Forward pass of the two-layer policy network.

    Reads the weights from the module-level ``model`` dict: ``W1`` maps the
    input to the hidden layer, ``W2`` maps the hidden layer to a single logit.

    Args:
        x: input vector multiplied by ``model['W1']``.

    Returns:
        ``(p, h)`` where ``p`` is the probability of taking action 2 and ``h``
        is the hidden state after the ReLU.
    """
    pre_activation = np.dot(model['W1'], x)
    # ReLU nonlinearity: negative entries become 0, everything else is kept
    hidden = np.where(pre_activation < 0, 0, pre_activation)
    logit = np.dot(model['W2'], hidden)
    return sigmoid(logit), hidden

def policy_backward(eph, epdlogp, epx=None):
    """Backward pass of the policy network over one episode.

    Args:
        eph: array of intermediate hidden states, one row per time step.
        epdlogp: per-step gradient signal on the output logit, one row per
            time step.
        epx: array of network inputs, one row per time step. Optional for
            backward compatibility: when omitted, the module-level ``epx``
            is used, which is how the function originally obtained it.

    Returns:
        Dict with gradients ``'W1'`` and ``'W2'`` for the matching entries of
        the module-level ``model``.

    Raises:
        NameError: if ``epx`` is not passed and no module-level ``epx`` exists.
    """
    if epx is None:
        # Legacy two-argument call: fall back to the global the original code
        # read implicitly, keeping the same exception type when it is missing.
        try:
            epx = globals()['epx']
        except KeyError:
            raise NameError("name 'epx' is not defined") from None
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0  # backprop through the ReLU: no gradient where the unit was inactive
    dW1 = np.dot(dh.T, epx)
    return {'W1': dW1, 'W2': dW2}

(200, 6400)
(200,)


In [None]:

# Training loop: plays Pong episodes forever, accumulating policy gradients per
# episode and applying an RMSProp update to `model` every `batch_size` episodes.
# There is no exit condition; the cell runs until the kernel is interrupted.
# NOTE(review): the unpacking below assumes the classic gym API, where reset()
# returns only the observation and step() returns a 4-tuple — confirm against
# the installed gym version.
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # previous preprocessed frame; used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[] # per-step inputs, hidden states, logit gradients and rewards for the current episode
running_reward = None # exponentially smoothed episode reward (None until the first episode ends)
reward_sum = 0 # total reward collected in the current episode
episode_number = 0
while True:
    if render: env.render()

    # preprocess the observation, set input to network to be difference image
    # (the first frame after a reset has no predecessor, so the input is all zeros)
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # action 2 with probability aprob, otherwise action 3

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label": 1 when the sampled action was 2
    dlogps.append(y - aprob) # gradient on the logit that encourages the action that was taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
        episode_number += 1

        # stack together all inputs, hidden states, action gradients, and rewards for this episode
        # (one row per time step)
        epx = np.vstack(xs) # not passed explicitly below; policy_backward obtains it from module scope
        eph = np.vstack(hs)
        epdlogp = np.vstack(dlogps)
        epr = np.vstack(drs)
        xs,hs,dlogps,drs = [],[],[],[] # reset array memory

        # compute the discounted reward backwards through time
        discounted_epr = discount_rewards(epr)
        # standardize the rewards to be unit normal (helps control the gradient estimator variance)
        # NOTE(review): there is no guard for np.std(...) == 0 here
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr)

        epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
        grad = policy_backward(eph, epdlogp)
        for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

        # perform rmsprop parameter update every batch_size episodes
        if episode_number % batch_size == 0:
            for k,v in model.items():
                g = grad_buffer[k] # gradient summed over the batch
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2 # leaky running sum of grad^2
                model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5) # note the +=: the step is added to the weights; 1e-5 avoids dividing by zero
                grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

        # boring book-keeping: smoothed reward is 0.99 * previous + 0.01 * this episode
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        # checkpointing is disabled; re-enabling the next line also requires importing pickle
        #if episode_number % 100 == 0: pickle.dump(model, open('save.p', 'wb'))
        reward_sum = 0
        observation = env.reset() # reset env
        prev_x = None # the next episode starts without a previous frame


resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -21.000000. running mean: -21.000000

resetting env. episode reward total was -17.000000. running mean: -20.638863
resetting env. episode reward total was -20.000000. running mean: -20.632475
resetting env. episode reward total was -21.000000. running mean: -20.636150
resetting env. episode reward total was -20.000000. running mean: -20.629789
resetting env. episode reward total was -21.000000. running mean: -20.633491
resetting env. episode reward total was -20.000000. running mean: -20.627156
resetting env. episode reward total was -21.000000. running mean: -20.630884
resetting env. episode reward total was -21.000000. running mean: -20.634575
resetting env. episode reward total was -20.000000. running mean: -20.628230
resetting env. episode reward total was -21.000000. running mean: -20.631947
resetting env. episode reward total was -20.000000. running mean: -20.625628
resetting env. episode reward total was -20.000000. running mean: -20.619372
resetting env. episode reward total was -21.000000. running mean: -20.623178

resetting env. episode reward total was -21.000000. running mean: -20.552122
resetting env. episode reward total was -21.000000. running mean: -20.556601
resetting env. episode reward total was -21.000000. running mean: -20.561035
resetting env. episode reward total was -20.000000. running mean: -20.555425
resetting env. episode reward total was -20.000000. running mean: -20.549870
resetting env. episode reward total was -20.000000. running mean: -20.544372
resetting env. episode reward total was -21.000000. running mean: -20.548928
resetting env. episode reward total was -19.000000. running mean: -20.533439
resetting env. episode reward total was -21.000000. running mean: -20.538104
resetting env. episode reward total was -18.000000. running mean: -20.512723
resetting env. episode reward total was -21.000000. running mean: -20.517596
resetting env. episode reward total was -20.000000. running mean: -20.512420
resetting env. episode reward total was -21.000000. running mean: -20.517296

resetting env. episode reward total was -21.000000. running mean: -20.496387
resetting env. episode reward total was -21.000000. running mean: -20.501423
resetting env. episode reward total was -18.000000. running mean: -20.476409
resetting env. episode reward total was -21.000000. running mean: -20.481645
resetting env. episode reward total was -21.000000. running mean: -20.486828
resetting env. episode reward total was -20.000000. running mean: -20.481960
resetting env. episode reward total was -19.000000. running mean: -20.467140
resetting env. episode reward total was -20.000000. running mean: -20.462469
resetting env. episode reward total was -20.000000. running mean: -20.457844
resetting env. episode reward total was -21.000000. running mean: -20.463266
resetting env. episode reward total was -18.000000. running mean: -20.438633
resetting env. episode reward total was -21.000000. running mean: -20.444247
resetting env. episode reward total was -20.000000. running mean: -20.439804

resetting env. episode reward total was -21.000000. running mean: -20.391851
resetting env. episode reward total was -21.000000. running mean: -20.397933
resetting env. episode reward total was -21.000000. running mean: -20.403953
resetting env. episode reward total was -20.000000. running mean: -20.399914
resetting env. episode reward total was -21.000000. running mean: -20.405915
resetting env. episode reward total was -21.000000. running mean: -20.411856
resetting env. episode reward total was -21.000000. running mean: -20.417737
resetting env. episode reward total was -21.000000. running mean: -20.423560
resetting env. episode reward total was -19.000000. running mean: -20.409324
resetting env. episode reward total was -20.000000. running mean: -20.405231
resetting env. episode reward total was -21.000000. running mean: -20.411179
resetting env. episode reward total was -21.000000. running mean: -20.417067
resetting env. episode reward total was -21.000000. running mean: -20.422896

resetting env. episode reward total was -20.000000. running mean: -20.366449
resetting env. episode reward total was -21.000000. running mean: -20.372784
resetting env. episode reward total was -21.000000. running mean: -20.379056
resetting env. episode reward total was -18.000000. running mean: -20.355266
resetting env. episode reward total was -21.000000. running mean: -20.361713
resetting env. episode reward total was -20.000000. running mean: -20.358096
resetting env. episode reward total was -19.000000. running mean: -20.344515
resetting env. episode reward total was -21.000000. running mean: -20.351070
resetting env. episode reward total was -21.000000. running mean: -20.357559
resetting env. episode reward total was -19.000000. running mean: -20.343983
resetting env. episode reward total was -21.000000. running mean: -20.350544
resetting env. episode reward total was -19.000000. running mean: -20.337038
resetting env. episode reward total was -21.000000. running mean: -20.343668

resetting env. episode reward total was -21.000000. running mean: -20.380047
resetting env. episode reward total was -20.000000. running mean: -20.376247
resetting env. episode reward total was -21.000000. running mean: -20.382484
resetting env. episode reward total was -21.000000. running mean: -20.388659
resetting env. episode reward total was -21.000000. running mean: -20.394773
resetting env. episode reward total was -21.000000. running mean: -20.400825
resetting env. episode reward total was -20.000000. running mean: -20.396817
resetting env. episode reward total was -21.000000. running mean: -20.402849
resetting env. episode reward total was -21.000000. running mean: -20.408820
resetting env. episode reward total was -19.000000. running mean: -20.394732
resetting env. episode reward total was -20.000000. running mean: -20.390785
resetting env. episode reward total was -19.000000. running mean: -20.376877
resetting env. episode reward total was -20.000000. running mean: -20.373108

resetting env. episode reward total was -21.000000. running mean: -20.376614
resetting env. episode reward total was -21.000000. running mean: -20.382847
resetting env. episode reward total was -21.000000. running mean: -20.389019
resetting env. episode reward total was -20.000000. running mean: -20.385129
resetting env. episode reward total was -21.000000. running mean: -20.391277
resetting env. episode reward total was -20.000000. running mean: -20.387365
resetting env. episode reward total was -21.000000. running mean: -20.393491
resetting env. episode reward total was -21.000000. running mean: -20.399556
resetting env. episode reward total was -21.000000. running mean: -20.405561
resetting env. episode reward total was -20.000000. running mean: -20.401505
resetting env. episode reward total was -21.000000. running mean: -20.407490
resetting env. episode reward total was -20.000000. running mean: -20.403415
resetting env. episode reward total was -21.000000. running mean: -20.409381

resetting env. episode reward total was -21.000000. running mean: -20.348294
resetting env. episode reward total was -21.000000. running mean: -20.354811
resetting env. episode reward total was -19.000000. running mean: -20.341263
resetting env. episode reward total was -20.000000. running mean: -20.337850
resetting env. episode reward total was -20.000000. running mean: -20.334472
resetting env. episode reward total was -21.000000. running mean: -20.341127
resetting env. episode reward total was -21.000000. running mean: -20.347716
resetting env. episode reward total was -17.000000. running mean: -20.314239
resetting env. episode reward total was -20.000000. running mean: -20.311096
resetting env. episode reward total was -21.000000. running mean: -20.317985
resetting env. episode reward total was -21.000000. running mean: -20.324806
resetting env. episode reward total was -19.000000. running mean: -20.311557
resetting env. episode reward total was -21.000000. running mean: -20.318442

resetting env. episode reward total was -21.000000. running mean: -20.381459
resetting env. episode reward total was -21.000000. running mean: -20.387644
resetting env. episode reward total was -21.000000. running mean: -20.393768
resetting env. episode reward total was -21.000000. running mean: -20.399830
resetting env. episode reward total was -21.000000. running mean: -20.405832
resetting env. episode reward total was -21.000000. running mean: -20.411773
resetting env. episode reward total was -21.000000. running mean: -20.417656
resetting env. episode reward total was -21.000000. running mean: -20.423479
resetting env. episode reward total was -21.000000. running mean: -20.429244
resetting env. episode reward total was -21.000000. running mean: -20.434952
resetting env. episode reward total was -21.000000. running mean: -20.440602
resetting env. episode reward total was -20.000000. running mean: -20.436196
resetting env. episode reward total was -20.000000. running mean: -20.431834

resetting env. episode reward total was -21.000000. running mean: -20.344175
resetting env. episode reward total was -19.000000. running mean: -20.330733
resetting env. episode reward total was -21.000000. running mean: -20.337426
resetting env. episode reward total was -19.000000. running mean: -20.324051
resetting env. episode reward total was -20.000000. running mean: -20.320811
resetting env. episode reward total was -19.000000. running mean: -20.307603
resetting env. episode reward total was -21.000000. running mean: -20.314527
resetting env. episode reward total was -20.000000. running mean: -20.311381
resetting env. episode reward total was -21.000000. running mean: -20.318268
resetting env. episode reward total was -21.000000. running mean: -20.325085
resetting env. episode reward total was -21.000000. running mean: -20.331834
resetting env. episode reward total was -21.000000. running mean: -20.338516
resetting env. episode reward total was -21.000000. running mean: -20.345131

resetting env. episode reward total was -20.000000. running mean: -20.412968
resetting env. episode reward total was -20.000000. running mean: -20.408838
resetting env. episode reward total was -21.000000. running mean: -20.414750
resetting env. episode reward total was -21.000000. running mean: -20.420602
resetting env. episode reward total was -21.000000. running mean: -20.426396
resetting env. episode reward total was -20.000000. running mean: -20.422132
resetting env. episode reward total was -20.000000. running mean: -20.417911
resetting env. episode reward total was -20.000000. running mean: -20.413732
resetting env. episode reward total was -20.000000. running mean: -20.409594
resetting env. episode reward total was -19.000000. running mean: -20.395498
resetting env. episode reward total was -21.000000. running mean: -20.401543
resetting env. episode reward total was -21.000000. running mean: -20.407528
resetting env. episode reward total was -21.000000. running mean: -20.413453

resetting env. episode reward total was -20.000000. running mean: -20.423029
resetting env. episode reward total was -21.000000. running mean: -20.428799
resetting env. episode reward total was -21.000000. running mean: -20.434511
resetting env. episode reward total was -19.000000. running mean: -20.420166
resetting env. episode reward total was -21.000000. running mean: -20.425964
resetting env. episode reward total was -21.000000. running mean: -20.431704
resetting env. episode reward total was -21.000000. running mean: -20.437387
resetting env. episode reward total was -21.000000. running mean: -20.443013
resetting env. episode reward total was -18.000000. running mean: -20.418583
resetting env. episode reward total was -20.000000. running mean: -20.414397
resetting env. episode reward total was -20.000000. running mean: -20.410253
resetting env. episode reward total was -21.000000. running mean: -20.416151
resetting env. episode reward total was -21.000000. running mean: -20.421989

resetting env. episode reward total was -20.000000. running mean: -20.143993
resetting env. episode reward total was -20.000000. running mean: -20.142553
resetting env. episode reward total was -21.000000. running mean: -20.151127
resetting env. episode reward total was -21.000000. running mean: -20.159616
resetting env. episode reward total was -21.000000. running mean: -20.168020
resetting env. episode reward total was -21.000000. running mean: -20.176340
resetting env. episode reward total was -21.000000. running mean: -20.184576
resetting env. episode reward total was -21.000000. running mean: -20.192731
resetting env. episode reward total was -20.000000. running mean: -20.190803
resetting env. episode reward total was -20.000000. running mean: -20.188895
resetting env. episode reward total was -21.000000. running mean: -20.197006
resetting env. episode reward total was -21.000000. running mean: -20.205036
resetting env. episode reward total was -21.000000. running mean: -20.212986

resetting env. episode reward total was -21.000000. running mean: -20.258511
resetting env. episode reward total was -20.000000. running mean: -20.255926
resetting env. episode reward total was -21.000000. running mean: -20.263367
resetting env. episode reward total was -19.000000. running mean: -20.250733
resetting env. episode reward total was -19.000000. running mean: -20.238226
resetting env. episode reward total was -20.000000. running mean: -20.235843
resetting env. episode reward total was -21.000000. running mean: -20.243485
resetting env. episode reward total was -21.000000. running mean: -20.251050
resetting env. episode reward total was -18.000000. running mean: -20.228540
resetting env. episode reward total was -20.000000. running mean: -20.226254
resetting env. episode reward total was -21.000000. running mean: -20.233992
resetting env. episode reward total was -21.000000. running mean: -20.241652
resetting env. episode reward total was -21.000000. running mean: -20.249235

resetting env. episode reward total was -21.000000. running mean: -20.333439
resetting env. episode reward total was -20.000000. running mean: -20.330104
resetting env. episode reward total was -21.000000. running mean: -20.336803
resetting env. episode reward total was -20.000000. running mean: -20.333435
resetting env. episode reward total was -21.000000. running mean: -20.340101
resetting env. episode reward total was -21.000000. running mean: -20.346700
resetting env. episode reward total was -19.000000. running mean: -20.333233
resetting env. episode reward total was -20.000000. running mean: -20.329901
resetting env. episode reward total was -19.000000. running mean: -20.316602
resetting env. episode reward total was -20.000000. running mean: -20.313435
resetting env. episode reward total was -19.000000. running mean: -20.300301
resetting env. episode reward total was -21.000000. running mean: -20.307298
resetting env. episode reward total was -21.000000. running mean: -20.314225

resetting env. episode reward total was -20.000000. running mean: -20.232323
resetting env. episode reward total was -21.000000. running mean: -20.240000
resetting env. episode reward total was -21.000000. running mean: -20.247600
resetting env. episode reward total was -21.000000. running mean: -20.255124
resetting env. episode reward total was -21.000000. running mean: -20.262573
resetting env. episode reward total was -21.000000. running mean: -20.269947
resetting env. episode reward total was -21.000000. running mean: -20.277247
resetting env. episode reward total was -20.000000. running mean: -20.274475
resetting env. episode reward total was -20.000000. running mean: -20.271730
resetting env. episode reward total was -21.000000. running mean: -20.279013
resetting env. episode reward total was -21.000000. running mean: -20.286223
resetting env. episode reward total was -21.000000. running mean: -20.293361
resetting env. episode reward total was -21.000000. running mean: -20.300427

resetting env. episode reward total was -19.000000. running mean: -20.276892
resetting env. episode reward total was -21.000000. running mean: -20.284123
resetting env. episode reward total was -21.000000. running mean: -20.291282
resetting env. episode reward total was -21.000000. running mean: -20.298369
resetting env. episode reward total was -21.000000. running mean: -20.305386
resetting env. episode reward total was -21.000000. running mean: -20.312332
resetting env. episode reward total was -21.000000. running mean: -20.319208
resetting env. episode reward total was -21.000000. running mean: -20.326016
resetting env. episode reward total was -21.000000. running mean: -20.332756
resetting env. episode reward total was -21.000000. running mean: -20.339429
resetting env. episode reward total was -20.000000. running mean: -20.336034
resetting env. episode reward total was -21.000000. running mean: -20.342674
resetting env. episode reward total was -21.000000. running mean: -20.349247

resetting env. episode reward total was -21.000000. running mean: -20.310855
resetting env. episode reward total was -20.000000. running mean: -20.307746
resetting env. episode reward total was -20.000000. running mean: -20.304669
resetting env. episode reward total was -21.000000. running mean: -20.311622
resetting env. episode reward total was -21.000000. running mean: -20.318506
resetting env. episode reward total was -21.000000. running mean: -20.325321
resetting env. episode reward total was -19.000000. running mean: -20.312068
resetting env. episode reward total was -21.000000. running mean: -20.318947
resetting env. episode reward total was -17.000000. running mean: -20.285757
resetting env. episode reward total was -21.000000. running mean: -20.292900
resetting env. episode reward total was -17.000000. running mean: -20.259971
resetting env. episode reward total was -21.000000. running mean: -20.267371
resetting env. episode reward total was -20.000000. running mean: -20.264697

resetting env. episode reward total was -21.000000. running mean: -20.223600
resetting env. episode reward total was -21.000000. running mean: -20.231364
resetting env. episode reward total was -21.000000. running mean: -20.239050
resetting env. episode reward total was -21.000000. running mean: -20.246660
resetting env. episode reward total was -21.000000. running mean: -20.254193
resetting env. episode reward total was -20.000000. running mean: -20.251651
resetting env. episode reward total was -21.000000. running mean: -20.259135
resetting env. episode reward total was -19.000000. running mean: -20.246543
resetting env. episode reward total was -20.000000. running mean: -20.244078
resetting env. episode reward total was -20.000000. running mean: -20.241637
resetting env. episode reward total was -19.000000. running mean: -20.229221
resetting env. episode reward total was -20.000000. running mean: -20.226929
resetting env. episode reward total was -19.000000. running mean: -20.214659

resetting env. episode reward total was -21.000000. running mean: -20.269906
resetting env. episode reward total was -18.000000. running mean: -20.247207
resetting env. episode reward total was -21.000000. running mean: -20.254735
resetting env. episode reward total was -19.000000. running mean: -20.242187
resetting env. episode reward total was -21.000000. running mean: -20.249765
resetting env. episode reward total was -19.000000. running mean: -20.237268
resetting env. episode reward total was -20.000000. running mean: -20.234895
resetting env. episode reward total was -21.000000. running mean: -20.242546
resetting env. episode reward total was -21.000000. running mean: -20.250121
resetting env. episode reward total was -20.000000. running mean: -20.247619
resetting env. episode reward total was -21.000000. running mean: -20.255143
resetting env. episode reward total was -19.000000. running mean: -20.242592
resetting env. episode reward total was -21.000000. running mean: -20.250166

resetting env. episode reward total was -20.000000. running mean: -20.253537
resetting env. episode reward total was -20.000000. running mean: -20.251002
resetting env. episode reward total was -21.000000. running mean: -20.258492
resetting env. episode reward total was -20.000000. running mean: -20.255907
resetting env. episode reward total was -20.000000. running mean: -20.253348
resetting env. episode reward total was -20.000000. running mean: -20.250814
resetting env. episode reward total was -17.000000. running mean: -20.218306
resetting env. episode reward total was -21.000000. running mean: -20.226123
resetting env. episode reward total was -16.000000. running mean: -20.183862
resetting env. episode reward total was -21.000000. running mean: -20.192023
resetting env. episode reward total was -21.000000. running mean: -20.200103
resetting env. episode reward total was -19.000000. running mean: -20.188102
resetting env. episode reward total was -21.000000. running mean: -20.196221

resetting env. episode reward total was -21.000000. running mean: -20.254963
resetting env. episode reward total was -21.000000. running mean: -20.262413
resetting env. episode reward total was -19.000000. running mean: -20.249789
resetting env. episode reward total was -21.000000. running mean: -20.257291
resetting env. episode reward total was -21.000000. running mean: -20.264718
resetting env. episode reward total was -20.000000. running mean: -20.262071
resetting env. episode reward total was -21.000000. running mean: -20.269450
resetting env. episode reward total was -21.000000. running mean: -20.276756
resetting env. episode reward total was -20.000000. running mean: -20.273988
resetting env. episode reward total was -19.000000. running mean: -20.261248
resetting env. episode reward total was -21.000000. running mean: -20.268636
resetting env. episode reward total was -21.000000. running mean: -20.275950
resetting env. episode reward total was -20.000000. running mean: -20.273190

resetting env. episode reward total was -21.000000. running mean: -20.256937
resetting env. episode reward total was -21.000000. running mean: -20.264367
resetting env. episode reward total was -20.000000. running mean: -20.261724
resetting env. episode reward total was -21.000000. running mean: -20.269106
resetting env. episode reward total was -21.000000. running mean: -20.276415
resetting env. episode reward total was -17.000000. running mean: -20.243651
resetting env. episode reward total was -21.000000. running mean: -20.251215
resetting env. episode reward total was -18.000000. running mean: -20.228703
resetting env. episode reward total was -20.000000. running mean: -20.226415
resetting env. episode reward total was -21.000000. running mean: -20.234151
resetting env. episode reward total was -20.000000. running mean: -20.231810
resetting env. episode reward total was -19.000000. running mean: -20.219492
resetting env. episode reward total was -20.000000. running mean: -20.217297

resetting env. episode reward total was -21.000000. running mean: -20.182272
resetting env. episode reward total was -21.000000. running mean: -20.190449
resetting env. episode reward total was -21.000000. running mean: -20.198545
resetting env. episode reward total was -21.000000. running mean: -20.206559
resetting env. episode reward total was -18.000000. running mean: -20.184494
resetting env. episode reward total was -20.000000. running mean: -20.182649
resetting env. episode reward total was -20.000000. running mean: -20.180822
resetting env. episode reward total was -21.000000. running mean: -20.189014
resetting env. episode reward total was -20.000000. running mean: -20.187124
resetting env. episode reward total was -20.000000. running mean: -20.185253
resetting env. episode reward total was -20.000000. running mean: -20.183400
resetting env. episode reward total was -18.000000. running mean: -20.161566
resetting env. episode reward total was -20.000000. running mean: -20.159950

resetting env. episode reward total was -20.000000. running mean: -20.258687
resetting env. episode reward total was -19.000000. running mean: -20.246101
resetting env. episode reward total was -21.000000. running mean: -20.253640
resetting env. episode reward total was -20.000000. running mean: -20.251103
resetting env. episode reward total was -20.000000. running mean: -20.248592
resetting env. episode reward total was -20.000000. running mean: -20.246106
resetting env. episode reward total was -20.000000. running mean: -20.243645
resetting env. episode reward total was -20.000000. running mean: -20.241209
resetting env. episode reward total was -21.000000. running mean: -20.248797
resetting env. episode reward total was -21.000000. running mean: -20.256309
resetting env. episode reward total was -21.000000. running mean: -20.263746
resetting env. episode reward total was -18.000000. running mean: -20.241108
resetting env. episode reward total was -21.000000. running mean: -20.248697

resetting env. episode reward total was -20.000000. running mean: -20.114560
resetting env. episode reward total was -21.000000. running mean: -20.123414
resetting env. episode reward total was -21.000000. running mean: -20.132180
resetting env. episode reward total was -20.000000. running mean: -20.130858
resetting env. episode reward total was -21.000000. running mean: -20.139550
resetting env. episode reward total was -19.000000. running mean: -20.128154
resetting env. episode reward total was -19.000000. running mean: -20.116873
resetting env. episode reward total was -20.000000. running mean: -20.115704
resetting env. episode reward total was -20.000000. running mean: -20.114547
resetting env. episode reward total was -20.000000. running mean: -20.113401
resetting env. episode reward total was -20.000000. running mean: -20.112267
resetting env. episode reward total was -21.000000. running mean: -20.121145
resetting env. episode reward total was -18.000000. running mean: -20.099933

resetting env. episode reward total was -21.000000. running mean: -20.123138
resetting env. episode reward total was -21.000000. running mean: -20.131906
resetting env. episode reward total was -16.000000. running mean: -20.090587
resetting env. episode reward total was -21.000000. running mean: -20.099681
resetting env. episode reward total was -20.000000. running mean: -20.098684
resetting env. episode reward total was -20.000000. running mean: -20.097698
resetting env. episode reward total was -21.000000. running mean: -20.106721
resetting env. episode reward total was -20.000000. running mean: -20.105653
resetting env. episode reward total was -18.000000. running mean: -20.084597
resetting env. episode reward total was -20.000000. running mean: -20.083751
resetting env. episode reward total was -21.000000. running mean: -20.092913
resetting env. episode reward total was -21.000000. running mean: -20.101984
resetting env. episode reward total was -21.000000. running mean: -20.110964

resetting env. episode reward total was -21.000000. running mean: -20.072184
resetting env. episode reward total was -20.000000. running mean: -20.071462
resetting env. episode reward total was -21.000000. running mean: -20.080748
resetting env. episode reward total was -19.000000. running mean: -20.069940
resetting env. episode reward total was -20.000000. running mean: -20.069241
resetting env. episode reward total was -21.000000. running mean: -20.078548
resetting env. episode reward total was -21.000000. running mean: -20.087763
resetting env. episode reward total was -20.000000. running mean: -20.086885
resetting env. episode reward total was -20.000000. running mean: -20.086016
resetting env. episode reward total was -20.000000. running mean: -20.085156
resetting env. episode reward total was -21.000000. running mean: -20.094305
resetting env. episode reward total was -20.000000. running mean: -20.093362
resetting env. episode reward total was -19.000000. running mean: -20.082428

resetting env. episode reward total was -21.000000. running mean: -20.173694
resetting env. episode reward total was -19.000000. running mean: -20.161957
resetting env. episode reward total was -21.000000. running mean: -20.170338
resetting env. episode reward total was -21.000000. running mean: -20.178634
resetting env. episode reward total was -21.000000. running mean: -20.186848
resetting env. episode reward total was -20.000000. running mean: -20.184980
resetting env. episode reward total was -20.000000. running mean: -20.183130
resetting env. episode reward total was -17.000000. running mean: -20.151299
resetting env. episode reward total was -21.000000. running mean: -20.159786
resetting env. episode reward total was -20.000000. running mean: -20.158188
resetting env. episode reward total was -20.000000. running mean: -20.156606
resetting env. episode reward total was -21.000000. running mean: -20.165040
resetting env. episode reward total was -20.000000. running mean: -20.163389

resetting env. episode reward total was -21.000000. running mean: -20.165309
resetting env. episode reward total was -21.000000. running mean: -20.173656
resetting env. episode reward total was -19.000000. running mean: -20.161919
resetting env. episode reward total was -20.000000. running mean: -20.160300
resetting env. episode reward total was -20.000000. running mean: -20.158697
resetting env. episode reward total was -18.000000. running mean: -20.137110
resetting env. episode reward total was -21.000000. running mean: -20.145739
resetting env. episode reward total was -19.000000. running mean: -20.134282
resetting env. episode reward total was -19.000000. running mean: -20.122939
resetting env. episode reward total was -20.000000. running mean: -20.121710
resetting env. episode reward total was -20.000000. running mean: -20.120492
resetting env. episode reward total was -21.000000. running mean: -20.129288
resetting env. episode reward total was -20.000000. running mean: -20.127995

resetting env. episode reward total was -21.000000. running mean: -20.075291
resetting env. episode reward total was -20.000000. running mean: -20.074538
resetting env. episode reward total was -20.000000. running mean: -20.073792
resetting env. episode reward total was -20.000000. running mean: -20.073054
resetting env. episode reward total was -20.000000. running mean: -20.072324
resetting env. episode reward total was -19.000000. running mean: -20.061601
resetting env. episode reward total was -20.000000. running mean: -20.060985
resetting env. episode reward total was -20.000000. running mean: -20.060375
resetting env. episode reward total was -20.000000. running mean: -20.059771
resetting env. episode reward total was -17.000000. running mean: -20.029173
resetting env. episode reward total was -19.000000. running mean: -20.018882
resetting env. episode reward total was -19.000000. running mean: -20.008693
resetting env. episode reward total was -21.000000. running mean: -20.018606

resetting env. episode reward total was -19.000000. running mean: -20.044539
resetting env. episode reward total was -19.000000. running mean: -20.034094
resetting env. episode reward total was -20.000000. running mean: -20.033753
resetting env. episode reward total was -20.000000. running mean: -20.033415
resetting env. episode reward total was -19.000000. running mean: -20.023081
resetting env. episode reward total was -18.000000. running mean: -20.002850
resetting env. episode reward total was -20.000000. running mean: -20.002822
resetting env. episode reward total was -21.000000. running mean: -20.012793
resetting env. episode reward total was -19.000000. running mean: -20.002666
resetting env. episode reward total was -20.000000. running mean: -20.002639
resetting env. episode reward total was -21.000000. running mean: -20.012612
resetting env. episode reward total was -21.000000. running mean: -20.022486
resetting env. episode reward total was -21.000000. running mean: -20.032261

resetting env. episode reward total was -21.000000. running mean: -20.118247
resetting env. episode reward total was -21.000000. running mean: -20.127064
resetting env. episode reward total was -19.000000. running mean: -20.115794
resetting env. episode reward total was -21.000000. running mean: -20.124636
resetting env. episode reward total was -20.000000. running mean: -20.123389
resetting env. episode reward total was -21.000000. running mean: -20.132156
resetting env. episode reward total was -19.000000. running mean: -20.120834
resetting env. episode reward total was -20.000000. running mean: -20.119626
resetting env. episode reward total was -19.000000. running mean: -20.108429
resetting env. episode reward total was -18.000000. running mean: -20.087345
resetting env. episode reward total was -20.000000. running mean: -20.086472
resetting env. episode reward total was -20.000000. running mean: -20.085607
resetting env. episode reward total was -21.000000. running mean: -20.094751

resetting env. episode reward total was -20.000000. running mean: -20.141535
resetting env. episode reward total was -19.000000. running mean: -20.130119
resetting env. episode reward total was -21.000000. running mean: -20.138818
resetting env. episode reward total was -20.000000. running mean: -20.137430
resetting env. episode reward total was -21.000000. running mean: -20.146056
resetting env. episode reward total was -21.000000. running mean: -20.154595
resetting env. episode reward total was -21.000000. running mean: -20.163049
resetting env. episode reward total was -18.000000. running mean: -20.141419
resetting env. episode reward total was -19.000000. running mean: -20.130004
resetting env. episode reward total was -18.000000. running mean: -20.108704
resetting env. episode reward total was -20.000000. running mean: -20.107617
resetting env. episode reward total was -21.000000. running mean: -20.116541
resetting env. episode reward total was -20.000000. running mean: -20.115376

resetting env. episode reward total was -21.000000. running mean: -20.083786
resetting env. episode reward total was -19.000000. running mean: -20.072948
resetting env. episode reward total was -19.000000. running mean: -20.062219
resetting env. episode reward total was -20.000000. running mean: -20.061597
resetting env. episode reward total was -20.000000. running mean: -20.060981
resetting env. episode reward total was -20.000000. running mean: -20.060371
resetting env. episode reward total was -21.000000. running mean: -20.069767
resetting env. episode reward total was -21.000000. running mean: -20.079070
resetting env. episode reward total was -19.000000. running mean: -20.068279
resetting env. episode reward total was -19.000000. running mean: -20.057596
resetting env. episode reward total was -19.000000. running mean: -20.047020
resetting env. episode reward total was -20.000000. running mean: -20.046550
resetting env. episode reward total was -21.000000. running mean: -20.056084

resetting env. episode reward total was -21.000000. running mean: -20.030641
resetting env. episode reward total was -21.000000. running mean: -20.040334
resetting env. episode reward total was -19.000000. running mean: -20.029931
resetting env. episode reward total was -19.000000. running mean: -20.019631
resetting env. episode reward total was -20.000000. running mean: -20.019435
resetting env. episode reward total was -21.000000. running mean: -20.029241
resetting env. episode reward total was -21.000000. running mean: -20.038948
resetting env. episode reward total was -21.000000. running mean: -20.048559
resetting env. episode reward total was -21.000000. running mean: -20.058073
resetting env. episode reward total was -21.000000. running mean: -20.067493
resetting env. episode reward total was -21.000000. running mean: -20.076818
resetting env. episode reward total was -21.000000. running mean: -20.086049
resetting env. episode reward total was -21.000000. running mean: -20.095189

resetting env. episode reward total was -19.000000. running mean: -19.990297
resetting env. episode reward total was -19.000000. running mean: -19.980395
resetting env. episode reward total was -19.000000. running mean: -19.970591
resetting env. episode reward total was -20.000000. running mean: -19.970885
resetting env. episode reward total was -21.000000. running mean: -19.981176
resetting env. episode reward total was -19.000000. running mean: -19.971364
resetting env. episode reward total was -18.000000. running mean: -19.951650
resetting env. episode reward total was -21.000000. running mean: -19.962134
resetting env. episode reward total was -21.000000. running mean: -19.972513
resetting env. episode reward total was -19.000000. running mean: -19.962787
resetting env. episode reward total was -20.000000. running mean: -19.963160
resetting env. episode reward total was -19.000000. running mean: -19.953528
resetting env. episode reward total was -21.000000. running mean: -19.963993

resetting env. episode reward total was -19.000000. running mean: -19.837928
resetting env. episode reward total was -21.000000. running mean: -19.849549
resetting env. episode reward total was -19.000000. running mean: -19.841054
resetting env. episode reward total was -21.000000. running mean: -19.852643
resetting env. episode reward total was -21.000000. running mean: -19.864117
resetting env. episode reward total was -18.000000. running mean: -19.845475
resetting env. episode reward total was -21.000000. running mean: -19.857021
resetting env. episode reward total was -20.000000. running mean: -19.858451
resetting env. episode reward total was -20.000000. running mean: -19.859866
resetting env. episode reward total was -20.000000. running mean: -19.861267
resetting env. episode reward total was -20.000000. running mean: -19.862655
resetting env. episode reward total was -20.000000. running mean: -19.864028
resetting env. episode reward total was -18.000000. running mean: -19.845388

resetting env. episode reward total was -20.000000. running mean: -19.994762
resetting env. episode reward total was -17.000000. running mean: -19.964815
resetting env. episode reward total was -21.000000. running mean: -19.975167
resetting env. episode reward total was -21.000000. running mean: -19.985415
resetting env. episode reward total was -21.000000. running mean: -19.995561
resetting env. episode reward total was -19.000000. running mean: -19.985605
resetting env. episode reward total was -20.000000. running mean: -19.985749
resetting env. episode reward total was -21.000000. running mean: -19.995892
resetting env. episode reward total was -21.000000. running mean: -20.005933
resetting env. episode reward total was -18.000000. running mean: -19.985873
resetting env. episode reward total was -19.000000. running mean: -19.976015
resetting env. episode reward total was -21.000000. running mean: -19.986254
resetting env. episode reward total was -21.000000. running mean: -19.996392

resetting env. episode reward total was -21.000000. running mean: -20.034060
resetting env. episode reward total was -20.000000. running mean: -20.033719
resetting env. episode reward total was -21.000000. running mean: -20.043382
resetting env. episode reward total was -17.000000. running mean: -20.012948
resetting env. episode reward total was -20.000000. running mean: -20.012819
resetting env. episode reward total was -21.000000. running mean: -20.022691
resetting env. episode reward total was -19.000000. running mean: -20.012464
resetting env. episode reward total was -21.000000. running mean: -20.022339
resetting env. episode reward total was -21.000000. running mean: -20.032116
resetting env. episode reward total was -21.000000. running mean: -20.041794
resetting env. episode reward total was -20.000000. running mean: -20.041376
resetting env. episode reward total was -20.000000. running mean: -20.040963
resetting env. episode reward total was -21.000000. running mean: -20.050553

resetting env. episode reward total was -20.000000. running mean: -19.977825
resetting env. episode reward total was -20.000000. running mean: -19.978047
resetting env. episode reward total was -18.000000. running mean: -19.958267
resetting env. episode reward total was -20.000000. running mean: -19.958684
resetting env. episode reward total was -20.000000. running mean: -19.959097
resetting env. episode reward total was -21.000000. running mean: -19.969506
resetting env. episode reward total was -19.000000. running mean: -19.959811
resetting env. episode reward total was -21.000000. running mean: -19.970213
resetting env. episode reward total was -19.000000. running mean: -19.960511
resetting env. episode reward total was -21.000000. running mean: -19.970906
resetting env. episode reward total was -19.000000. running mean: -19.961197
resetting env. episode reward total was -16.000000. running mean: -19.921585
resetting env. episode reward total was -19.000000. running mean: -19.912369

resetting env. episode reward total was -20.000000. running mean: -19.946800
resetting env. episode reward total was -21.000000. running mean: -19.957332
resetting env. episode reward total was -21.000000. running mean: -19.967759
resetting env. episode reward total was -19.000000. running mean: -19.958081
resetting env. episode reward total was -21.000000. running mean: -19.968501
resetting env. episode reward total was -21.000000. running mean: -19.978816
resetting env. episode reward total was -20.000000. running mean: -19.979027
resetting env. episode reward total was -20.000000. running mean: -19.979237
resetting env. episode reward total was -19.000000. running mean: -19.969445
resetting env. episode reward total was -20.000000. running mean: -19.969750
resetting env. episode reward total was -20.000000. running mean: -19.970053
resetting env. episode reward total was -20.000000. running mean: -19.970352
resetting env. episode reward total was -18.000000. running mean: -19.950649

resetting env. episode reward total was -18.000000. running mean: -19.874250
resetting env. episode reward total was -21.000000. running mean: -19.885507
resetting env. episode reward total was -21.000000. running mean: -19.896652
resetting env. episode reward total was -21.000000. running mean: -19.907686
resetting env. episode reward total was -18.000000. running mean: -19.888609
resetting env. episode reward total was -20.000000. running mean: -19.889723
resetting env. episode reward total was -21.000000. running mean: -19.900825
resetting env. episode reward total was -18.000000. running mean: -19.881817
resetting env. episode reward total was -20.000000. running mean: -19.882999
resetting env. episode reward total was -19.000000. running mean: -19.874169
resetting env. episode reward total was -19.000000. running mean: -19.865427
resetting env. episode reward total was -21.000000. running mean: -19.876773
resetting env. episode reward total was -19.000000. running mean: -19.868005

resetting env. episode reward total was -19.000000. running mean: -19.811630
resetting env. episode reward total was -20.000000. running mean: -19.813514
resetting env. episode reward total was -21.000000. running mean: -19.825378
resetting env. episode reward total was -21.000000. running mean: -19.837125
resetting env. episode reward total was -20.000000. running mean: -19.838753
resetting env. episode reward total was -16.000000. running mean: -19.800366
resetting env. episode reward total was -20.000000. running mean: -19.802362
resetting env. episode reward total was -21.000000. running mean: -19.814339
resetting env. episode reward total was -20.000000. running mean: -19.816195
resetting env. episode reward total was -21.000000. running mean: -19.828033
resetting env. episode reward total was -21.000000. running mean: -19.839753
resetting env. episode reward total was -18.000000. running mean: -19.821355
resetting env. episode reward total was -18.000000. running mean: -19.803142

resetting env. episode reward total was -19.000000. running mean: -19.790420
resetting env. episode reward total was -19.000000. running mean: -19.782516
resetting env. episode reward total was -19.000000. running mean: -19.774691
resetting env. episode reward total was -20.000000. running mean: -19.776944
resetting env. episode reward total was -21.000000. running mean: -19.789174
resetting env. episode reward total was -20.000000. running mean: -19.791283
resetting env. episode reward total was -20.000000. running mean: -19.793370
resetting env. episode reward total was -20.000000. running mean: -19.795436
resetting env. episode reward total was -21.000000. running mean: -19.807482
resetting env. episode reward total was -20.000000. running mean: -19.809407
resetting env. episode reward total was -20.000000. running mean: -19.811313
resetting env. episode reward total was -21.000000. running mean: -19.823200
resetting env. episode reward total was -21.000000. running mean: -19.834968

resetting env. episode reward total was -18.000000. running mean: -19.848929
resetting env. episode reward total was -19.000000. running mean: -19.840439
resetting env. episode reward total was -21.000000. running mean: -19.852035
resetting env. episode reward total was -21.000000. running mean: -19.863515
resetting env. episode reward total was -19.000000. running mean: -19.854880
resetting env. episode reward total was -20.000000. running mean: -19.856331
resetting env. episode reward total was -17.000000. running mean: -19.827767
resetting env. episode reward total was -21.000000. running mean: -19.839490
resetting env. episode reward total was -19.000000. running mean: -19.831095
resetting env. episode reward total was -20.000000. running mean: -19.832784
resetting env. episode reward total was -20.000000. running mean: -19.834456
resetting env. episode reward total was -20.000000. running mean: -19.836111
resetting env. episode reward total was -19.000000. running mean: -19.827750

resetting env. episode reward total was -21.000000. running mean: -19.759009
resetting env. episode reward total was -20.000000. running mean: -19.761419
resetting env. episode reward total was -21.000000. running mean: -19.773805
resetting env. episode reward total was -20.000000. running mean: -19.776067
resetting env. episode reward total was -21.000000. running mean: -19.788306
resetting env. episode reward total was -19.000000. running mean: -19.780423
resetting env. episode reward total was -20.000000. running mean: -19.782619
resetting env. episode reward total was -19.000000. running mean: -19.774793
resetting env. episode reward total was -20.000000. running mean: -19.777045
resetting env. episode reward total was -19.000000. running mean: -19.769275
resetting env. episode reward total was -19.000000. running mean: -19.761582
resetting env. episode reward total was -21.000000. running mean: -19.773966
resetting env. episode reward total was -21.000000. running mean: -19.786226

resetting env. episode reward total was -20.000000. running mean: -19.745644
resetting env. episode reward total was -19.000000. running mean: -19.738187
resetting env. episode reward total was -19.000000. running mean: -19.730805
resetting env. episode reward total was -20.000000. running mean: -19.733497
resetting env. episode reward total was -19.000000. running mean: -19.726162
resetting env. episode reward total was -20.000000. running mean: -19.728901
resetting env. episode reward total was -19.000000. running mean: -19.721612
resetting env. episode reward total was -20.000000. running mean: -19.724396
resetting env. episode reward total was -20.000000. running mean: -19.727152
resetting env. episode reward total was -18.000000. running mean: -19.709880
resetting env. episode reward total was -20.000000. running mean: -19.712781
resetting env. episode reward total was -18.000000. running mean: -19.695653
resetting env. episode reward total was -17.000000. running mean: -19.668697

resetting env. episode reward total was -20.000000. running mean: -19.692044
resetting env. episode reward total was -21.000000. running mean: -19.705123
resetting env. episode reward total was -21.000000. running mean: -19.718072
resetting env. episode reward total was -20.000000. running mean: -19.720891
resetting env. episode reward total was -20.000000. running mean: -19.723683
resetting env. episode reward total was -18.000000. running mean: -19.706446
resetting env. episode reward total was -18.000000. running mean: -19.689381
resetting env. episode reward total was -17.000000. running mean: -19.662487
resetting env. episode reward total was -20.000000. running mean: -19.665863
resetting env. episode reward total was -21.000000. running mean: -19.679204
resetting env. episode reward total was -19.000000. running mean: -19.672412
resetting env. episode reward total was -18.000000. running mean: -19.655688
resetting env. episode reward total was -21.000000. running mean: -19.669131

resetting env. episode reward total was -20.000000. running mean: -19.667926
resetting env. episode reward total was -21.000000. running mean: -19.681247
resetting env. episode reward total was -21.000000. running mean: -19.694434
resetting env. episode reward total was -20.000000. running mean: -19.697490
resetting env. episode reward total was -21.000000. running mean: -19.710515
resetting env. episode reward total was -21.000000. running mean: -19.723410
resetting env. episode reward total was -21.000000. running mean: -19.736176
resetting env. episode reward total was -19.000000. running mean: -19.728814
resetting env. episode reward total was -19.000000. running mean: -19.721526
resetting env. episode reward total was -21.000000. running mean: -19.734311
resetting env. episode reward total was -18.000000. running mean: -19.716967
resetting env. episode reward total was -19.000000. running mean: -19.709798
resetting env. episode reward total was -19.000000. running mean: -19.702700

resetting env. episode reward total was -21.000000. running mean: -19.712016
resetting env. episode reward total was -20.000000. running mean: -19.714895
resetting env. episode reward total was -20.000000. running mean: -19.717747
resetting env. episode reward total was -21.000000. running mean: -19.730569
resetting env. episode reward total was -20.000000. running mean: -19.733263
resetting env. episode reward total was -20.000000. running mean: -19.735931
resetting env. episode reward total was -21.000000. running mean: -19.748571
resetting env. episode reward total was -19.000000. running mean: -19.741086
resetting env. episode reward total was -21.000000. running mean: -19.753675
resetting env. episode reward total was -19.000000. running mean: -19.746138
resetting env. episode reward total was -19.000000. running mean: -19.738677
resetting env. episode reward total was -21.000000. running mean: -19.751290
resetting env. episode reward total was -20.000000. running mean: -19.753777

In [28]:
# Close the environment now that the training output above has ended.
# NOTE(review): `env` is created outside the visible part of this notebook --
# presumably the Gym Pong environment; since `render = True` is set in the
# config, this is presumably also what tears down the render window -- confirm.
env.close()