In [1]:
# References
"""
https://tensorflowkorea.gitbooks.io/tensorflow-kr/content/g3doc/api_docs/python/constant_op.html
https://www.youtube.com/watch?v=6CCXyfvubvY&t=0s&list=PLlMkM4tgfjnLSOjrEJN31gZATbcj_MpUm&index=35
Game environment: http://solarisailab.com/archives/486 
"""

'\nhttps://tensorflowkorea.gitbooks.io/tensorflow-kr/content/g3doc/api_docs/python/constant_op.html\nhttps://www.youtube.com/watch?v=6CCXyfvubvY&t=0s&list=PLlMkM4tgfjnLSOjrEJN31gZATbcj_MpUm&index=35\nGame environment: http://solarisailab.com/archives/486 \n'

In [2]:
# Library
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import os

In [3]:
# Hyper-parameters shared by the model, the environment and the training loop.

# --- Game ---
gridSize = 10                 # Side length of the square game grid.
input_size = gridSize ** 2    # The grid is flattened to a 1-D vector before it is fed to the network.
Actions = 3                   # Number of discrete actions (left, stay, right).

# --- Network ---
hiddenSize = 100              # Neurons in each hidden layer.
learningRate = 0.2            # Step size of the gradient-descent optimizer.

# --- Q-learning ---
discount = 0.9                # Discount factor in (0, 1): favours states that reach the reward sooner.
Memory = 500                  # Capacity of the replay memory (number of stored experiences).
batchSize = 50                # Mini-batch size used for each training step.
episodes = 2000               # Number of games to play during training.

In [4]:
# Build the Q-network: two ReLU hidden layers followed by a linear layer with one output per action.
def _dense_params(fan_in, fan_out):
    """Create the (weights, biases) variables of one fully connected layer.

    Weights are drawn from a truncated normal with stddev 1/sqrt(fan_in);
    biases from a truncated normal with stddev 0.01.
    """
    weight_stddev = 1.0 / math.sqrt(float(fan_in))
    weights = tf.Variable(tf.truncated_normal([fan_in, fan_out], stddev=weight_stddev))
    biases = tf.Variable(tf.truncated_normal([fan_out], stddev=0.01))
    return weights, biases


# Network input: a batch of flattened game grids.
X = tf.placeholder(tf.float32, [None, input_size])

# Layer 1: input -> hidden
W1, b1 = _dense_params(input_size, hiddenSize)
input_layer = tf.nn.relu(tf.matmul(X, W1) + b1)

# Layer 2: hidden -> hidden
W2, b2 = _dense_params(hiddenSize, hiddenSize)
hidden_layer = tf.nn.relu(tf.matmul(input_layer, W2) + b2)

# Layer 3: hidden -> one linear output per action (no activation)
W3, b3 = _dense_params(hiddenSize, Actions)
output_layer = tf.matmul(hidden_layer, W3) + b3

# Training targets: one value per action for every row of X.
Y = tf.placeholder(tf.float32, [None, Actions])

# Sum of squared errors, scaled by 1 / (2 * batchSize).
squared_error = tf.square(Y - output_layer)
cost = tf.reduce_sum(squared_error) / (2 * batchSize)

# Plain gradient descent on the cost above.
optimizer = tf.train.GradientDescentOptimizer(learningRate).minimize(cost)

In [5]:
# The environment: handles interactions and holds the state of the game.
class CatchEnvironment():
    """Catch game on a square grid: a fruit falls one row per step and a 3-cell basket moves below it.

    ``self.state`` is ``[fruit_row, fruit_col, basket_col]`` using 1-based
    coordinates; ``drawState`` subtracts 1 when indexing the canvas.
    The basket is drawn on the last canvas row and occupies its own column
    plus the column on each side, so its centre is kept in ``[2, gridSize - 1]``.
    """

    def __init__(self, gridSize):
        """Create the environment for a ``gridSize`` x ``gridSize`` board."""
        self.gridSize = gridSize
        self.input_size = self.gridSize * self.gridSize
        # Start from a valid random state instead of uninitialised memory so that
        # observe()/act() are safe even if the caller forgets to call reset().
        self.reset()

    def observe(self):
        """Return the current board flattened to shape ``(1, input_size)``."""
        canvas = self.drawState()
        return np.reshape(canvas, (-1, self.input_size))

    def drawState(self):
        """Return a ``(gridSize, gridSize)`` array with 1 at the fruit and basket cells, 0 elsewhere."""
        canvas = np.zeros((self.gridSize, self.gridSize))
        canvas[self.state[0] - 1, self.state[1] - 1] = 1  # Draw the fruit.
        # Draw the basket: its centre column and the two adjacent columns on the last row.
        bottomRow = self.gridSize - 1
        for offset in (-1, 0, 1):
            canvas[bottomRow, self.state[2] - 1 + offset] = 1
        return canvas

    def reset(self):
        """Start a new game and return ``(fruit_row, fruit_col, basket)``.

        The fruit starts on row 1 in a random column of ``[1, gridSize]``; the
        basket centre starts in a random column of ``[2, gridSize - 1]``.
        """
        initialFruitColumn = random.randrange(1, self.gridSize + 1)
        initialBucketPosition = random.randrange(2, self.gridSize + 1 - 1)
        self.state = np.array([1, initialFruitColumn, initialBucketPosition])
        return self.getState()

    def getState(self):
        """Return the state as the tuple ``(fruit_row, fruit_col, basket)``."""
        stateInfo = self.state
        return stateInfo[0], stateInfo[1], stateInfo[2]

    def getReward(self):
        """Return the reward for the current state.

        0 while the fruit is still falling. Once the fruit row equals
        ``gridSize - 1`` (the row directly above the one the basket is drawn on):
        +1 if the fruit column is within one column of the basket centre, else -1.
        """
        fruitRow, fruitColumn, basket = self.getState()
        if fruitRow != self.gridSize - 1:
            return 0
        # The fruit has reached the catching row: check whether the basket covers its column.
        if abs(fruitColumn - basket) <= 1:
            return 1
        return -1

    def isGameOver(self):
        """Return True once the fruit row equals ``gridSize - 1``."""
        return bool(self.state[0] == self.gridSize - 1)

    def updateState(self, action):
        """Move the basket according to ``action`` and drop the fruit by one row.

        ``action`` 1 moves the basket left, 2 keeps it in place, and any other
        value (3 in normal use) moves it right.
        """
        if action == 1:
            move = -1
        elif action == 2:
            move = 0
        else:
            move = 1
        fruitRow, fruitColumn, basket = self.getState()
        # The min/max keeps the 3-cell wide basket inside the grid.
        newBasket = min(max(2, basket + move), self.gridSize - 1)
        fruitRow = fruitRow + 1  # The fruit falls by one row on every action.
        self.state = np.array([fruitRow, fruitColumn, newBasket])

    # Action can be 1 (move left), 2 (stay) or 3 (move right).
    def act(self, action):
        """Apply ``action`` and return ``(observation, reward, gameOver, state)``.

        ``state`` is the raw ``(fruit_row, fruit_col, basket)`` tuple, returned
        in addition to the observation for visualisation purposes.
        """
        self.updateState(action)
        reward = self.getReward()
        gameOver = self.isGameOver()
        return self.observe(), reward, gameOver, self.getState()

In [6]:
# The memory: stores the experiences produced by the agent's actions and builds
# mini-batches of (input, target) pairs from them for training.
class ReplayMemory:
    """Fixed-capacity ring buffer of experiences with Q-learning target construction.

    Each slot holds ``(state, action, reward, next state, game over)``.
    ``count`` is the number of filled slots and ``current`` is the slot the next
    experience will be written to (wrapping around at ``Memory``).
    Actions are stored 1-based and converted to 0-based output indices in ``getBatch``.
    """

    def __init__(self, gridSize, Memory, discount):
        """Allocate storage for ``Memory`` experiences of a ``gridSize`` x ``gridSize`` game."""
        self.Memory = Memory
        self.gridSize = gridSize
        self.input_size = self.gridSize * self.gridSize
        self.discount = discount
        # Buffers are sized from input_size so that any grid size works.
        self.inputState = np.empty((self.Memory, self.input_size), dtype=np.float32)
        self.actions = np.zeros(self.Memory, dtype=np.uint8)
        self.nextState = np.empty((self.Memory, self.input_size), dtype=np.float32)
        # Builtin bool: the np.bool alias no longer exists in current NumPy releases.
        self.gameOver = np.empty(self.Memory, dtype=bool)
        self.rewards = np.empty(self.Memory, dtype=np.int8)
        self.count = 0
        self.current = 0

    def remember(self, currentState, action, reward, nextState, gameOver):
        """Store one experience, overwriting the oldest slot once the buffer is full."""
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.inputState[self.current, ...] = currentState
        self.nextState[self.current, ...] = nextState
        self.gameOver[self.current] = gameOver
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.Memory

    def getBatch(self, model, batchSize, Actions, input_size, sess, X):
        """Sample stored experiences (with replacement) and build training data.

        Args:
            model: object passed as the fetch to ``sess.run``; its result is used
                as one row of ``Actions`` Q-values per input row.
            batchSize: requested batch size; fewer rows are returned while the
                memory holds fewer experiences than that.
            Actions: number of columns of the returned targets.
            input_size: number of columns of the returned inputs.
            sess: object providing ``run(fetch, feed_dict=...)``.
            X: feed_dict key under which the states are fed to ``model``.

        Returns:
            ``(inputs, targets)`` with shapes ``(n, input_size)`` and ``(n, Actions)``.
            Each target row is the model's current output for that state, except
            the entry of the action that was taken, which is set to ``reward`` for
            terminal experiences and ``reward + discount * max_a' Q(s', a')`` otherwise.
            The untouched entries therefore contribute zero error.
        """
        memoryLength = self.count
        chosenBatchSize = min(batchSize, memoryLength)

        inputs = np.zeros((chosenBatchSize, input_size))
        targets = np.zeros((chosenBatchSize, Actions))
        if chosenBatchSize == 0:
            # Nothing stored yet: return empty arrays without touching the model.
            return inputs, targets

        # Sample only from the filled slots 0 .. count - 1 (slot 0 included).
        indices = [random.randrange(memoryLength) for _ in range(chosenBatchSize)]
        sampledStates = self.inputState[indices]
        sampledNextStates = self.nextState[indices]

        # Evaluate the model once per batch instead of once per sample.
        inputs[:] = sampledStates
        targets[:] = sess.run(model, feed_dict={X: sampledStates})
        nextQ = sess.run(model, feed_dict={X: sampledNextStates})
        nextStateMaxQ = np.max(nextQ, axis=1)

        # reward for terminal experiences, reward + discount * max_a' Q(s', a') otherwise.
        updatedQ = self.rewards[indices].astype(np.float64)
        stillPlaying = ~self.gameOver[indices]
        updatedQ[stillPlaying] += self.discount * nextStateMaxQ[stillPlaying]

        # Stored actions are 1-based; cast before subtracting so uint8 cannot wrap.
        actionColumns = self.actions[indices].astype(np.intp) - 1
        targets[np.arange(chosenBatchSize), actionColumns] = updatedQ

        return inputs, targets

In [7]:
def main():
    """Train the Q-network on the catch game for ``episodes`` games.

    Each step picks an action epsilon-greedily, applies it to the environment,
    stores the experience in the replay memory, and performs one optimizer step
    on a batch sampled from that memory.

    Returns:
        (err_list, score_list): per-episode summed training loss, and the
        cumulative number of caught fruits after each episode.
    """
    err_list = []
    score_list = []
    print("Training new model")

    # Define Environment
    env = CatchEnvironment(gridSize)

    # Define Replay Memory
    memory = ReplayMemory(gridSize, Memory, discount)

    winCount = 0
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for i in range(episodes):
            # Exploration rate decays with the episode index (1, 1/2, 1/3, ... every
            # 10 episodes). Integer division keeps the schedule the same on Python 2 and 3.
            e = 1. / ((i // 10) + 1)
            err = 0
            env.reset()

            isGameOver = False

            # The initial state of the environment.
            currentState = env.observe()

            while not isGameOver:
                # Decide between a random action and the greedy action of the network.
                if np.random.rand() < e:
                    action = random.randrange(1, Actions + 1)
                else:
                    # Forward the current state through the network.
                    q = sess.run(output_layer, feed_dict={X: currentState})
                    # Network outputs are 0-based, environment actions are 1-based.
                    action = q.argmax() + 1

                nextState, reward, gameOver, _ = env.act(action)

                if reward == 1:
                    winCount = winCount + 1

                memory.remember(currentState, action, reward, nextState, gameOver)

                # Update the current state and whether the game is over.
                currentState = nextState
                isGameOver = gameOver

                # Get a batch of training data from the replay memory.
                inputs, targets = memory.getBatch(output_layer, batchSize, Actions, input_size, sess, X)

                # Train the network; the run also returns the loss of this batch.
                _, loss = sess.run([optimizer, cost], feed_dict={X: inputs, Y: targets})
                err = err + loss

            err_list.append(err)
            score_list.append(winCount)

            # Note: the value printed as "reward" is the cumulative win count so far.
            print("Episode " + str(i) + ": reward = " + str(winCount) + " err = " + str(err))

    return err_list, score_list

In [8]:
# Start training only when this code runs as the main program (e.g. when the cell is executed).
if __name__ == '__main__':
    main()

Training new model
Episode 0: reward = 0 err = 0.0110636350721
Episode 1: reward = 0 err = 0.0719164234488
Episode 2: reward = 0 err = 0.180526711978
Episode 3: reward = 0 err = 0.233190221712
Episode 4: reward = 0 err = 0.196437946521
Episode 5: reward = 0 err = 0.24689606484
Episode 6: reward = 1 err = 0.19026269624
Episode 7: reward = 2 err = 0.222813867033
Episode 8: reward = 2 err = 0.305231183767
Episode 9: reward = 2 err = 0.347439421341
Episode 10: reward = 2 err = 0.290931546129
Episode 11: reward = 2 err = 0.31633673422
Episode 12: reward = 3 err = 0.31713533029
Episode 13: reward = 3 err = 0.239454433322
Episode 14: reward = 3 err = 0.270022586919
Episode 15: reward = 3 err = 0.219919445924
Episode 16: reward = 3 err = 0.326902993023
Episode 17: reward = 3 err = 0.320353019983
Episode 18: reward = 3 err = 0.330730538815
Episode 19: reward = 4 err = 0.261226082221
Episode 20: reward = 4 err = 0.389124816284
Episode 21: reward = 4 err = 0.237657755613
Episode 22: reward = 4 er

Episode 180: reward = 75 err = 0.0328891300596
Episode 181: reward = 76 err = 0.0279679880477
Episode 182: reward = 76 err = 0.0326116114156
Episode 183: reward = 77 err = 0.0354595780373
Episode 184: reward = 78 err = 0.0259091677144
Episode 185: reward = 79 err = 0.0272868989268
Episode 186: reward = 80 err = 0.0201698115561
Episode 187: reward = 80 err = 0.0349222084042
Episode 188: reward = 80 err = 0.0311415730976
Episode 189: reward = 81 err = 0.0352669381537
Episode 190: reward = 82 err = 0.0311138581019
Episode 191: reward = 82 err = 0.0294114605058
Episode 192: reward = 82 err = 0.0212377795251
Episode 193: reward = 83 err = 0.0356773680542
Episode 194: reward = 84 err = 0.0216702789767
Episode 195: reward = 84 err = 0.0212733568624
Episode 196: reward = 84 err = 0.0212586895796
Episode 197: reward = 85 err = 0.0184515605215
Episode 198: reward = 86 err = 0.0315940529108
Episode 199: reward = 87 err = 0.0155260921456
Episode 200: reward = 88 err = 0.020449549309
Episode 201: r

Episode 353: reward = 209 err = 0.0439342583995
Episode 354: reward = 210 err = 0.0276370078791
Episode 355: reward = 211 err = 0.023688929854
Episode 356: reward = 212 err = 0.0249164691195
Episode 357: reward = 213 err = 0.0288876636187
Episode 358: reward = 214 err = 0.0254459875869
Episode 359: reward = 215 err = 0.0254122311017
Episode 360: reward = 216 err = 0.0158271621913
Episode 361: reward = 217 err = 0.0207849787548
Episode 362: reward = 218 err = 0.0231565865688
Episode 363: reward = 219 err = 0.0150357143721
Episode 364: reward = 220 err = 0.0156134415884
Episode 365: reward = 221 err = 0.0227802179288
Episode 366: reward = 222 err = 0.0278826736612
Episode 367: reward = 223 err = 0.0144698646618
Episode 368: reward = 224 err = 0.0208563024644
Episode 369: reward = 225 err = 0.0131232624408
Episode 370: reward = 226 err = 0.0177234975272
Episode 371: reward = 227 err = 0.0125669012778
Episode 372: reward = 228 err = 0.0125037915423
Episode 373: reward = 229 err = 0.0169372

Episode 523: reward = 378 err = 0.00177510239155
Episode 524: reward = 379 err = 0.00137749566056
Episode 525: reward = 380 err = 0.00175673203194
Episode 526: reward = 381 err = 0.0017699737873
Episode 527: reward = 382 err = 0.00186274306179
Episode 528: reward = 383 err = 0.00179688993376
Episode 529: reward = 384 err = 0.00118531578482
Episode 530: reward = 385 err = 0.00148356207137
Episode 531: reward = 386 err = 0.00126620050287
Episode 532: reward = 387 err = 0.00153306766151
Episode 533: reward = 388 err = 0.00138759580295
Episode 534: reward = 389 err = 0.00113932025124
Episode 535: reward = 390 err = 0.00162527975044
Episode 536: reward = 391 err = 0.00182505331759
Episode 537: reward = 392 err = 0.00121300622413
Episode 538: reward = 393 err = 0.00162171351985
Episode 539: reward = 394 err = 0.00194473583542
Episode 540: reward = 395 err = 0.00122274117894
Episode 541: reward = 396 err = 0.00118078335436
Episode 542: reward = 397 err = 0.00118301491966
Episode 543: reward =

Episode 691: reward = 545 err = 0.00240214652877
Episode 692: reward = 546 err = 0.0032419635827
Episode 693: reward = 547 err = 0.002291183544
Episode 694: reward = 548 err = 0.000838136249513
Episode 695: reward = 549 err = 0.000961069486948
Episode 696: reward = 550 err = 0.0010812163091
Episode 697: reward = 551 err = 0.0011277904232
Episode 698: reward = 552 err = 0.000675767256325
Episode 699: reward = 553 err = 0.000523120648722
Episode 700: reward = 554 err = 0.000541820008948
Episode 701: reward = 555 err = 0.000643568399028
Episode 702: reward = 556 err = 0.000422810708187
Episode 703: reward = 557 err = 0.000555671958864
Episode 704: reward = 558 err = 0.00142732353197
Episode 705: reward = 559 err = 0.0008861223032
Episode 706: reward = 560 err = 0.000790030768258
Episode 707: reward = 561 err = 0.000487485765916
Episode 708: reward = 562 err = 0.000822378944576
Episode 709: reward = 563 err = 0.000696957460605
Episode 710: reward = 564 err = 0.000631531767795
Episode 711: 

Episode 856: reward = 710 err = 0.000988469793811
Episode 857: reward = 711 err = 0.000649959823932
Episode 858: reward = 712 err = 0.000848997391586
Episode 859: reward = 713 err = 0.000859685551404
Episode 860: reward = 714 err = 0.00070533131111
Episode 861: reward = 715 err = 0.000557406036023
Episode 862: reward = 716 err = 0.000270285125225
Episode 863: reward = 717 err = 0.00047412267304
Episode 864: reward = 718 err = 0.00060201407905
Episode 865: reward = 719 err = 0.000264909194811
Episode 866: reward = 720 err = 0.000578076638703
Episode 867: reward = 721 err = 0.000262998875769
Episode 868: reward = 722 err = 0.000578426019274
Episode 869: reward = 723 err = 0.000610930364928
Episode 870: reward = 724 err = 0.000227865430134
Episode 871: reward = 725 err = 0.000746404604797
Episode 872: reward = 726 err = 0.000560464895898
Episode 873: reward = 727 err = 0.00038290084558
Episode 874: reward = 728 err = 0.000450223031294
Episode 875: reward = 729 err = 0.000494669407999
Epis

Episode 1020: reward = 874 err = 0.000380130451958
Episode 1021: reward = 875 err = 0.000673973358971
Episode 1022: reward = 876 err = 0.000263626170181
Episode 1023: reward = 877 err = 0.000238280952544
Episode 1024: reward = 878 err = 0.00274081735006
Episode 1025: reward = 879 err = 0.000922970149986
Episode 1026: reward = 880 err = 0.00050645499141
Episode 1027: reward = 881 err = 0.000317273950714
Episode 1028: reward = 882 err = 0.00032838698462
Episode 1029: reward = 883 err = 0.000302011679196
Episode 1030: reward = 884 err = 0.000471908871077
Episode 1031: reward = 885 err = 0.000225562812375
Episode 1032: reward = 886 err = 0.000679888507875
Episode 1033: reward = 887 err = 0.000272084575954
Episode 1034: reward = 888 err = 0.000375830323719
Episode 1035: reward = 889 err = 0.000766538893913
Episode 1036: reward = 890 err = 0.000367594974705
Episode 1037: reward = 891 err = 0.000248056869168
Episode 1038: reward = 892 err = 0.00060174859118
Episode 1039: reward = 893 err = 0.

Episode 1181: reward = 1035 err = 4.44631025402e-05
Episode 1182: reward = 1036 err = 8.44108528781e-05
Episode 1183: reward = 1037 err = 6.55908097542e-05
Episode 1184: reward = 1038 err = 6.53780243738e-05
Episode 1185: reward = 1039 err = 4.42322291292e-05
Episode 1186: reward = 1040 err = 5.41273891486e-05
Episode 1187: reward = 1041 err = 6.49045873615e-05
Episode 1188: reward = 1042 err = 6.70661804634e-05
Episode 1189: reward = 1043 err = 9.12709117529e-05
Episode 1190: reward = 1044 err = 5.68215523344e-05
Episode 1191: reward = 1045 err = 6.71667517054e-05
Episode 1192: reward = 1046 err = 4.74514472444e-05
Episode 1193: reward = 1047 err = 5.43611849935e-05
Episode 1194: reward = 1048 err = 6.35496660379e-05
Episode 1195: reward = 1049 err = 0.000110713948061
Episode 1196: reward = 1050 err = 9.78953645472e-05
Episode 1197: reward = 1051 err = 6.24568441481e-05
Episode 1198: reward = 1052 err = 0.00374333481341
Episode 1199: reward = 1053 err = 0.00275816167368
Episode 1200: 

Episode 1340: reward = 1194 err = 3.39671823895e-05
Episode 1341: reward = 1195 err = 5.05526272718e-05
Episode 1342: reward = 1196 err = 3.69579001926e-05
Episode 1343: reward = 1197 err = 2.99761635461e-05
Episode 1344: reward = 1198 err = 3.58281752142e-05
Episode 1345: reward = 1199 err = 2.94722723311e-05
Episode 1346: reward = 1200 err = 2.32336828958e-05
Episode 1347: reward = 1201 err = 3.76821293457e-05
Episode 1348: reward = 1202 err = 4.12366341607e-05
Episode 1349: reward = 1203 err = 4.74841003779e-05
Episode 1350: reward = 1204 err = 0.00515069842356
Episode 1351: reward = 1205 err = 0.000211287038837
Episode 1352: reward = 1206 err = 0.0109547888114
Episode 1353: reward = 1207 err = 0.00618848421436
Episode 1354: reward = 1208 err = 0.00522048994753
Episode 1355: reward = 1209 err = 0.00126503733918
Episode 1356: reward = 1210 err = 0.000819858107207
Episode 1357: reward = 1211 err = 0.000334478800141
Episode 1358: reward = 1212 err = 0.000507001095684
Episode 1359: rewa

Episode 1501: reward = 1354 err = 0.00102607176814
Episode 1502: reward = 1355 err = 0.000766658598877
Episode 1503: reward = 1356 err = 0.00226165328786
Episode 1504: reward = 1357 err = 0.000874913002917
Episode 1505: reward = 1358 err = 0.000711504055289
Episode 1506: reward = 1359 err = 0.00108549935976
Episode 1507: reward = 1360 err = 0.000978111853328
Episode 1508: reward = 1361 err = 0.00158457882208
Episode 1509: reward = 1362 err = 0.000889060363988
Episode 1510: reward = 1363 err = 0.000754009077355
Episode 1511: reward = 1364 err = 0.00125728395869
Episode 1512: reward = 1365 err = 0.00122090492732
Episode 1513: reward = 1366 err = 0.00124218492056
Episode 1514: reward = 1367 err = 0.00177653858191
Episode 1515: reward = 1368 err = 0.00278649245229
Episode 1516: reward = 1369 err = 0.00173738369995
Episode 1517: reward = 1370 err = 0.00188130815513
Episode 1518: reward = 1371 err = 0.00162618644572
Episode 1519: reward = 1372 err = 0.00101303362317
Episode 1520: reward = 13

Episode 1661: reward = 1514 err = 0.000658448552713
Episode 1662: reward = 1515 err = 0.000354266685463
Episode 1663: reward = 1516 err = 0.000470094990305
Episode 1664: reward = 1517 err = 0.000380893567126
Episode 1665: reward = 1518 err = 0.00051105503735
Episode 1666: reward = 1519 err = 0.000507930566528
Episode 1667: reward = 1520 err = 0.000272922396107
Episode 1668: reward = 1521 err = 0.000407119267038
Episode 1669: reward = 1522 err = 0.000296718342724
Episode 1670: reward = 1523 err = 0.000323711492456
Episode 1671: reward = 1524 err = 0.000243855243752
Episode 1672: reward = 1525 err = 0.000357016928319
Episode 1673: reward = 1526 err = 0.000458752700069
Episode 1674: reward = 1527 err = 0.00023674116801
Episode 1675: reward = 1528 err = 0.000399252769057
Episode 1676: reward = 1529 err = 0.000315005509037
Episode 1677: reward = 1530 err = 0.000313208285661
Episode 1678: reward = 1531 err = 0.000356334305252
Episode 1679: reward = 1532 err = 0.000216520211325
Episode 1680: 

Episode 1820: reward = 1673 err = 0.000267855260972
Episode 1821: reward = 1674 err = 0.000281282462311
Episode 1822: reward = 1675 err = 0.000149032569425
Episode 1823: reward = 1676 err = 0.000189712163774
Episode 1824: reward = 1677 err = 0.00021671083141
Episode 1825: reward = 1678 err = 0.000175960043634
Episode 1826: reward = 1679 err = 0.000105038919173
Episode 1827: reward = 1680 err = 0.000173028990503
Episode 1828: reward = 1681 err = 0.000123739039736
Episode 1829: reward = 1682 err = 0.000114314450002
Episode 1830: reward = 1683 err = 0.00011333426437
Episode 1831: reward = 1684 err = 0.000151125048433
Episode 1832: reward = 1685 err = 0.000125783267777
Episode 1833: reward = 1686 err = 9.1029381565e-05
Episode 1834: reward = 1687 err = 8.4372361016e-05
Episode 1835: reward = 1688 err = 9.47850094235e-05
Episode 1836: reward = 1689 err = 0.000109741700726
Episode 1837: reward = 1690 err = 8.57035743138e-05
Episode 1838: reward = 1691 err = 9.7995264241e-05
Episode 1839: rew

Episode 1979: reward = 1832 err = 0.00010685210782
Episode 1980: reward = 1833 err = 0.00010757545806
Episode 1981: reward = 1834 err = 0.000142929497088
Episode 1982: reward = 1835 err = 0.000116510670978
Episode 1983: reward = 1836 err = 0.000118618690522
Episode 1984: reward = 1837 err = 0.000100865887362
Episode 1985: reward = 1838 err = 0.000106263331872
Episode 1986: reward = 1839 err = 8.47145006446e-05
Episode 1987: reward = 1840 err = 8.88416820999e-05
Episode 1988: reward = 1841 err = 0.000120217428503
Episode 1989: reward = 1842 err = 7.98398400548e-05
Episode 1990: reward = 1843 err = 0.000110231873123
Episode 1991: reward = 1844 err = 0.000110768495688
Episode 1992: reward = 1845 err = 8.66899354151e-05
Episode 1993: reward = 1846 err = 0.000106472715288
Episode 1994: reward = 1847 err = 7.4836955946e-05
Episode 1995: reward = 1848 err = 6.2500482727e-05
Episode 1996: reward = 1849 err = 8.29528557915e-05
Episode 1997: reward = 1850 err = 8.82786634975e-05
Episode 1998: re