# Reinforcement Learning Simulation in OpenAI Gym Environment

## 1. CartPole-v1

In [16]:
import gym
import random
import math
import time
import numpy as np
import pandas as pd
from IPython.display import clear_output

# Create the CartPole-v1 environment and show its observation and action spaces.
env = gym.make("CartPole-v1")
print(env.observation_space, env.action_space, sep="\n")

Box(4,)
Discrete(2)


In [2]:
class Agent():
    """Baseline agent that records an environment's space layout.

    Offers two simple policies: a uniformly random one (`get_action_random`)
    and a hand-written rule based on index 2 of the state (`get_action`).
    """

    def __init__(self, env):
        """Inspect ``env`` and cache (and print) a description of its spaces.

        For a Discrete action/observation space the number of elements is
        stored; otherwise the ``low``/``high``/``shape`` attributes are stored.
        """
        action_space = env.action_space
        observation_space = env.observation_space
        discrete_type = gym.spaces.discrete.Discrete

        # Exact type comparison: subclasses of Discrete are treated as non-discrete.
        self.is_discrete = type(action_space) == discrete_type
        self.env_discrete = type(observation_space) == discrete_type

        if not self.is_discrete:
            self.action_low = action_space.low
            self.action_high = action_space.high
            self.action_shape = action_space.shape
            print("||Action-Space|| range:", self.action_low, self.action_high)
        else:
            self.action_size = action_space.n
            print("||Action-Space|| = {}".format(self.action_size))

        if not self.env_discrete:
            self.state_low = observation_space.low
            self.state_high = observation_space.high
            self.state_shape = observation_space.shape
            print("||State-Space|| range:", self.state_low, self.state_high)
        else:
            self.state_size = observation_space.n
            print("||State-Space|| = {}".format(self.state_size))

        # Keep references to the raw space objects as well.
        self.states = observation_space
        self.actions = action_space

    def get_action_random(self, state):
        """Return a uniformly sampled action; ``state`` is not used."""
        if not self.is_discrete:
            return np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return random.choice(range(self.action_size))

    def get_action(self, state):
        """Return 0 when ``state[2]`` (the pole angle) is negative, otherwise 1."""
        lean = state[2]
        if lean < 0:
            return 0
        return 1
    
# Instantiate the baseline agent; its constructor prints the space summary.
myagent = Agent(env=env)

||Action-Space|| = 2
||State-Space|| range: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


In [3]:
# Roll out the current agent's `get_action` policy for a user-chosen number of
# episodes, rendering every step and printing each observed state.
n = int(input("Enter the no. of episodes for simulation: "))
max_timesteps = 10000  # upper bound on steps per episode

try:
    for episode in range(n):
        state = env.reset()
        for t in range(max_timesteps):
            env.render()
            print("In State: ", state)
            action = myagent.get_action(state)
            state, reward, done, info = env.step(action)
            if done:
                print("Episode {} done in {} timesteps!\n".format(episode+1, t+1))
                break
        else:
            # Only reached when the step cap was exhausted without `done`.
            print("Episode {} NOT done in limit of {} timesteps! Agent Terminated!\n".format(episode+1, t+1))
finally:
    # Close the environment even if the run is interrupted or a step raises,
    # so the cleanup is never skipped.
    env.close()

Enter the no. of episodes for simulation: 5
In State:  [-0.02678381  0.01326885  0.04522186 -0.03733366]
In State:  [-0.02651843  0.20771412  0.04447518 -0.31541269]
In State:  [-0.02236415  0.40217526  0.03816693 -0.59374457]
In State:  [-0.01432064  0.59674277  0.02629204 -0.87416493]
In State:  [-0.00238579  0.79149758  0.00880874 -1.15846721]
In State:  [ 0.01344416  0.98650363 -0.0143606  -1.4483752 ]
In State:  [ 0.03317424  0.79156116 -0.04332811 -1.16021351]
In State:  [ 0.04900546  0.59702964 -0.06653238 -0.88142457]
In State:  [ 0.06094605  0.40287145 -0.08416087 -0.61037773]
In State:  [ 0.06900348  0.20902047 -0.09636842 -0.34534455]
In State:  [ 0.07318389  0.01539202 -0.10327532 -0.08453796]
In State:  [ 0.07349173 -0.17810957 -0.10496607  0.17385891]
In State:  [ 0.06992954 -0.37158487 -0.1014889   0.43167118]
In State:  [ 0.06249784 -0.56513434 -0.09285547  0.69071496]
In State:  [ 0.05119515 -0.75885355 -0.07904117  0.95278166]
In State:  [ 0.03601808 -0.95282809 -0.05

In State:  [ 0.06877519 -0.39017131 -0.07860984  0.54032877]
In State:  [ 0.06097176 -0.58410538 -0.06780326  0.80724403]
In State:  [ 0.04928966 -0.77823578 -0.05165838  1.07785164]
In State:  [ 0.03372494 -0.97263879 -0.03010135  1.35388635]
In State:  [ 0.01427216 -1.16737022 -0.00302362  1.63700271]
In State:  [-0.00907524 -1.36245658  0.02971643  1.92874197]
In State:  [-0.03632437 -1.1676653   0.06829127  1.6454194 ]
In State:  [-0.05967768 -0.97340565  0.10119966  1.37477117]
In State:  [-0.07914579 -0.77968355  0.12869508  1.11537668]
In State:  [-0.09473946 -0.58646407  0.15100261  0.86567627]
In State:  [-0.10646874 -0.39368417  0.16831614  0.6240232 ]
In State:  [-0.11434243 -0.20126227  0.1807966   0.38872086]
In State:  [-0.11836767 -0.00910517  0.18857102  0.15804833]
In State:  [-0.11854978  0.18288698  0.19173199 -0.06972285]
In State:  [-0.11489204  0.37481704  0.19033753 -0.29631799]
In State:  [-0.10739569  0.56678816  0.18441117 -0.52345234]
In State:  [-0.09605993 

## Q-Learning

In [43]:
class QLAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Continuous observations are mapped onto a fixed grid (``size`` buckets per
    state dimension); one Q-value per (bucket indices..., action) is stored in
    ``self.q_table``.
    """

    def __init__(self, env, size=(1, 1, 6, 12,), discount_rate=0.97, learning_rate=0.01,
                 eps_decay=0.99, eps_min=0.0):
        """Set up the Q-table and the learning hyperparameters.

        Args:
            env: environment passed on to ``Agent.__init__``.
            size: number of buckets for each of the state dimensions.
            discount_rate: discount applied to the best next-state Q-value.
            learning_rate: step size of each Q-table update.
            eps_decay: factor applied to ``eps`` after every finished episode.
            eps_min: lower bound for ``eps``; the default 0.0 imposes no floor.
        """
        super().__init__(env)

        self.eps = 1.0  # start fully exploratory
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate

        # Here `state_size` holds the bucket counts per dimension; tuple() lets
        # callers pass a list as well.
        self.state_size = tuple(size)

        # One zero-initialised entry per (discretized state, action).
        self.q_table = np.zeros(self.state_size + (self.action_size,))
        print("QTable constructed with size: {}".format(self.q_table.size))

    def discretize(self, state):
        """Convert a continuous observation into a tuple of bucket indices.

        Dimensions 0 and 2 use the observation-space bounds; dimensions 1 and 3
        use hand-picked bounds (+/-0.5 and +/-radians(50)) because the
        environment reports them as effectively unbounded. Values outside the
        bounds land in the first or last bucket.
        """
        upper_bounds = [self.state_high[0], 0.5, self.state_high[2], math.radians(50)]
        lower_bounds = [self.state_low[0], -0.5, self.state_low[2], -math.radians(50)]

        buckets = []
        for value, low, high, n_buckets in zip(state, lower_bounds, upper_bounds, self.state_size):
            # Position of `value` inside [low, high] as a fraction (may fall outside 0..1).
            ratio = (value - low) / (high - low)
            index = int(round((n_buckets - 1) * ratio))
            # Clamp out-of-range values into the outermost buckets.
            buckets.append(min(n_buckets - 1, max(0, index)))
        return tuple(buckets)

    def get_action(self, state):
        """Epsilon-greedy policy: random action with probability ``eps``, else greedy."""
        bucket = self.discretize(state)
        action_greedy = np.argmax(self.q_table[bucket])
        # The random action is drawn before the epsilon test, matching the
        # order in which the random number generator is consumed.
        action_random = super().get_action_random(bucket)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Apply one Q-learning update for a single transition.

        Args:
            experience: tuple ``(state, action, next_state, reward, done)``
                with raw (non-discretized) states.
        """
        state, action, next_state, reward, done = experience
        state = self.discretize(state)
        next_state = self.discretize(next_state)

        # A finished episode has no future value to bootstrap from.
        future_value = 0.0 if done else np.max(self.q_table[next_state])
        q_target = reward + self.discount_rate * future_value

        td_error = q_target - self.q_table[(*state, action)]
        self.q_table[(*state, action)] += self.learning_rate * td_error

        if done:
            # Decay exploration once per finished episode, never below eps_min.
            self.eps = max(self.eps_min, self.eps * self.eps_decay)
        
# Build a Q-learning agent with the default bucket sizes and hyperparameters.
myagent = QLAgent(env=env)

||Action-Space|| = 2
||State-Space|| range: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
QTable constructed with size: 144


In [9]:
# Train the current `myagent` online for a user-chosen number of episodes.
n = int(input("Enter the no. of episodes for simulation: "))
max_timesteps = 10000  # upper bound on steps per episode

# Running total over ALL episodes (it is not reset at the start of each one).
total_reward = 0
try:
    for ep in range(n):
        state = env.reset()
        for t in range(max_timesteps):
            env.render()
            action = myagent.get_action(state)
            next_state, reward, done, info = env.step(action)

            # Learn from the transition before moving on to the next state.
            myagent.train((state,action,next_state,reward,done))
            state = next_state
            total_reward += reward

            if done:
                print("Episode {} done in {} timesteps! Total reward: {}, eps: {}\n".format(ep+1, t+1, total_reward, myagent.eps))
                break
        else:
            # Only reached when the step cap was exhausted without `done`.
            print("Episode {} NOT done in limit of {} timesteps! Agent Terminated!\n".format(ep+1, t+1))
finally:
    # Close the environment even if training is interrupted or a step raises.
    env.close()

Enter the no. of episodes for simulation: 100
Episode 1 done in 8 timesteps! Total reward: 8.0, eps: 0.36237201786049694

Episode 2 done in 45 timesteps! Total reward: 53.0, eps: 0.358748297681892

Episode 3 done in 34 timesteps! Total reward: 87.0, eps: 0.35516081470507305

Episode 4 done in 15 timesteps! Total reward: 102.0, eps: 0.3516092065580223

Episode 5 done in 18 timesteps! Total reward: 120.0, eps: 0.34809311449244207

Episode 6 done in 43 timesteps! Total reward: 163.0, eps: 0.34461218334751764

Episode 7 done in 21 timesteps! Total reward: 184.0, eps: 0.34116606151404244

Episode 8 done in 16 timesteps! Total reward: 200.0, eps: 0.337754400898902

Episode 9 done in 31 timesteps! Total reward: 231.0, eps: 0.334376856889913

Episode 10 done in 8 timesteps! Total reward: 239.0, eps: 0.33103308832101386

Episode 11 done in 11 timesteps! Total reward: 250.0, eps: 0.3277227574378037

Episode 12 done in 13 timesteps! Total reward: 263.0, eps: 0.3244455298634257

Episode 13 done in

In [8]:
# Print the agent's full Q-table for inspection.
print(myagent.q_table)

[[[[[0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]]

   [[0.15234051 0.25087245]
    [0.08871926 0.02336006]
    [0.01       0.03233699]
    [0.07782855 0.01      ]
    [0.02028694 0.02009768]
    [0.04968298 0.010097  ]
    [0.03027899 0.        ]
    [0.03969111 0.01038689]
    [0.         0.03076871]
    [0.02001517 0.        ]
    [0.02450818 0.        ]
    [0.         0.        ]]

   [[0.96848375 0.55191832]
    [0.20574224 0.62882707]
    [0.18770265 0.60231248]
    [0.24004139 0.40248881]
    [0.30558492 0.15055859]
    [0.90042061 0.40358814]
    [0.40604021 0.22432   ]
    [0.36279301 0.86574965]
    [0.11825322 0.04884841]
    [0.10127722 0.30745696]
    [0.08603051 0.02323985]
    [0.35701015 

In [10]:
# Print the agent's full Q-table again for inspection.
# NOTE(review): presumably run after further training (the values differ from the earlier dump) - confirm.
print(myagent.q_table)

[[[[[0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]
    [0.         0.        ]]

   [[0.17751685 0.45650439]
    [0.13652711 0.02336006]
    [0.01       0.07219514]
    [0.11512372 0.02113971]
    [0.03037778 0.03033201]
    [0.04968298 0.02093554]
    [0.03027899 0.0108463 ]
    [0.03969111 0.01038689]
    [0.         0.03076871]
    [0.02001517 0.        ]
    [0.02450818 0.        ]
    [0.         0.        ]]

   [[1.428104   0.67089683]
    [0.28537093 1.17610319]
    [0.23834355 0.87968056]
    [0.27687906 0.6420026 ]
    [0.55152815 0.17934761]
    [2.19932997 0.59671124]
    [1.02580013 0.46171566]
    [0.56693842 2.24316158]
    [0.30893827 0.10906512]
    [0.17879325 0.57339756]
    [0.27259237 0.03513686]
    [0.52611707 

In [44]:
# Fresh Q-learning agent on a finer state grid with a lower discount rate.
# NOTE(review): in the recorded output of this cell, episode lengths stay at
# roughly 9-53 steps across all 1000 episodes while eps decays to ~4e-5, so
# these hyperparameters probably need tuning - confirm.
myagent = QLAgent(env, size=(4, 4, 12, 48,), discount_rate=0.5, learning_rate=0.02)

n = int(input("Enter the no. of episodes for simulation: "))
max_timesteps = 10000  # upper bound on steps per episode

# Running total over ALL episodes (it is not reset at the start of each one).
total_reward = 0
try:
    for ep in range(n):
        state = env.reset()
        for t in range(max_timesteps):
            env.render()
            action = myagent.get_action(state)
            next_state, reward, done, info = env.step(action)

            # Learn from the transition before moving on to the next state.
            myagent.train((state,action,next_state,reward,done))
            state = next_state
            total_reward += reward

            if done:
                print("Episode {} done in {} timesteps! Total reward: {}, eps: {}\n".format(ep+1, t+1, total_reward, myagent.eps))
                break
        else:
            # Only reached when the step cap was exhausted without `done`.
            print("Episode {} NOT done in limit of {} timesteps! Agent Terminated!\n".format(ep+1, t+1))
finally:
    # Close the environment even if training is interrupted or a step raises.
    env.close()

||Action-Space|| = 2
||State-Space|| range: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38] [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
QTable constructed with size: 18432
Enter the no. of episodes for simulation: 1000
Episode 1 done in 15 timesteps! Total reward: 15.0, eps: 0.99

Episode 2 done in 26 timesteps! Total reward: 41.0, eps: 0.9801

Episode 3 done in 12 timesteps! Total reward: 53.0, eps: 0.9702989999999999

Episode 4 done in 23 timesteps! Total reward: 76.0, eps: 0.96059601

Episode 5 done in 14 timesteps! Total reward: 90.0, eps: 0.9509900498999999

Episode 6 done in 10 timesteps! Total reward: 100.0, eps: 0.9414801494009999

Episode 7 done in 23 timesteps! Total reward: 123.0, eps: 0.9320653479069899

Episode 8 done in 30 timesteps! Total reward: 153.0, eps: 0.92274469442792

Episode 9 done in 11 timesteps! Total reward: 164.0, eps: 0.9135172474836407

Episode 10 done in 20 timesteps! Total reward: 184.0, eps: 0.9043820750088043

Episode 11 do

Episode 102 done in 14 timesteps! Total reward: 2185.0, eps: 0.358748297681892

Episode 103 done in 35 timesteps! Total reward: 2220.0, eps: 0.35516081470507305

Episode 104 done in 25 timesteps! Total reward: 2245.0, eps: 0.3516092065580223

Episode 105 done in 22 timesteps! Total reward: 2267.0, eps: 0.34809311449244207

Episode 106 done in 16 timesteps! Total reward: 2283.0, eps: 0.34461218334751764

Episode 107 done in 19 timesteps! Total reward: 2302.0, eps: 0.34116606151404244

Episode 108 done in 46 timesteps! Total reward: 2348.0, eps: 0.337754400898902

Episode 109 done in 16 timesteps! Total reward: 2364.0, eps: 0.334376856889913

Episode 110 done in 12 timesteps! Total reward: 2376.0, eps: 0.33103308832101386

Episode 111 done in 16 timesteps! Total reward: 2392.0, eps: 0.3277227574378037

Episode 112 done in 28 timesteps! Total reward: 2420.0, eps: 0.3244455298634257

Episode 113 done in 17 timesteps! Total reward: 2437.0, eps: 0.3212010745647914

Episode 114 done in 16 tim

Episode 204 done in 9 timesteps! Total reward: 4110.0, eps: 0.12870034108965536

Episode 205 done in 32 timesteps! Total reward: 4142.0, eps: 0.12741333767875881

Episode 206 done in 20 timesteps! Total reward: 4162.0, eps: 0.12613920430197123

Episode 207 done in 26 timesteps! Total reward: 4188.0, eps: 0.12487781225895152

Episode 208 done in 13 timesteps! Total reward: 4201.0, eps: 0.123629034136362

Episode 209 done in 33 timesteps! Total reward: 4234.0, eps: 0.12239274379499838

Episode 210 done in 16 timesteps! Total reward: 4250.0, eps: 0.1211688163570484

Episode 211 done in 36 timesteps! Total reward: 4286.0, eps: 0.11995712819347792

Episode 212 done in 26 timesteps! Total reward: 4312.0, eps: 0.11875755691154315

Episode 213 done in 30 timesteps! Total reward: 4342.0, eps: 0.11756998134242772

Episode 214 done in 9 timesteps! Total reward: 4351.0, eps: 0.11639428152900344

Episode 215 done in 50 timesteps! Total reward: 4401.0, eps: 0.11523033871371341

Episode 216 done in 1

Episode 305 done in 36 timesteps! Total reward: 6284.0, eps: 0.04663740229999265

Episode 306 done in 9 timesteps! Total reward: 6293.0, eps: 0.04617102827699272

Episode 307 done in 11 timesteps! Total reward: 6304.0, eps: 0.045709317994222794

Episode 308 done in 17 timesteps! Total reward: 6321.0, eps: 0.04525222481428057

Episode 309 done in 16 timesteps! Total reward: 6337.0, eps: 0.04479970256613776

Episode 310 done in 12 timesteps! Total reward: 6349.0, eps: 0.04435170554047638

Episode 311 done in 9 timesteps! Total reward: 6358.0, eps: 0.043908188485071616

Episode 312 done in 24 timesteps! Total reward: 6382.0, eps: 0.0434691066002209

Episode 313 done in 21 timesteps! Total reward: 6403.0, eps: 0.04303441553421869

Episode 314 done in 12 timesteps! Total reward: 6415.0, eps: 0.0426040713788765

Episode 315 done in 17 timesteps! Total reward: 6432.0, eps: 0.04217803066508773

Episode 316 done in 26 timesteps! Total reward: 6458.0, eps: 0.04175625035843686

Episode 317 done i

Episode 405 done in 14 timesteps! Total reward: 8248.0, eps: 0.017070797554767782

Episode 406 done in 41 timesteps! Total reward: 8289.0, eps: 0.016900089579220106

Episode 407 done in 13 timesteps! Total reward: 8302.0, eps: 0.016731088683427906

Episode 408 done in 13 timesteps! Total reward: 8315.0, eps: 0.016563777796593626

Episode 409 done in 18 timesteps! Total reward: 8333.0, eps: 0.016398140018627688

Episode 410 done in 28 timesteps! Total reward: 8361.0, eps: 0.01623415861844141

Episode 411 done in 19 timesteps! Total reward: 8380.0, eps: 0.016071817032256998

Episode 412 done in 13 timesteps! Total reward: 8393.0, eps: 0.01591109886193443

Episode 413 done in 12 timesteps! Total reward: 8405.0, eps: 0.015751987873315085

Episode 414 done in 27 timesteps! Total reward: 8432.0, eps: 0.015594467994581935

Episode 415 done in 23 timesteps! Total reward: 8455.0, eps: 0.015438523314636115

Episode 416 done in 22 timesteps! Total reward: 8477.0, eps: 0.015284138081489753

Episod

Episode 504 done in 26 timesteps! Total reward: 10303.0, eps: 0.00631157979431613

Episode 505 done in 26 timesteps! Total reward: 10329.0, eps: 0.006248463996372969

Episode 506 done in 13 timesteps! Total reward: 10342.0, eps: 0.006185979356409239

Episode 507 done in 10 timesteps! Total reward: 10352.0, eps: 0.006124119562845147

Episode 508 done in 23 timesteps! Total reward: 10375.0, eps: 0.006062878367216695

Episode 509 done in 36 timesteps! Total reward: 10411.0, eps: 0.006002249583544528

Episode 510 done in 45 timesteps! Total reward: 10456.0, eps: 0.005942227087709083

Episode 511 done in 13 timesteps! Total reward: 10469.0, eps: 0.005882804816831992

Episode 512 done in 20 timesteps! Total reward: 10489.0, eps: 0.005823976768663672

Episode 513 done in 33 timesteps! Total reward: 10522.0, eps: 0.005765737000977035

Episode 514 done in 30 timesteps! Total reward: 10552.0, eps: 0.005708079630967265

Episode 515 done in 12 timesteps! Total reward: 10564.0, eps: 0.0056509988346

Episode 602 done in 24 timesteps! Total reward: 12267.0, eps: 0.0023571496064139765

Episode 603 done in 14 timesteps! Total reward: 12281.0, eps: 0.0023335781103498367

Episode 604 done in 13 timesteps! Total reward: 12294.0, eps: 0.002310242329246338

Episode 605 done in 11 timesteps! Total reward: 12305.0, eps: 0.002287139905953875

Episode 606 done in 11 timesteps! Total reward: 12316.0, eps: 0.0022642685068943362

Episode 607 done in 13 timesteps! Total reward: 12329.0, eps: 0.002241625821825393

Episode 608 done in 9 timesteps! Total reward: 12338.0, eps: 0.002219209563607139

Episode 609 done in 26 timesteps! Total reward: 12364.0, eps: 0.0021970174679710676

Episode 610 done in 20 timesteps! Total reward: 12384.0, eps: 0.0021750472932913567

Episode 611 done in 14 timesteps! Total reward: 12398.0, eps: 0.0021532968203584434

Episode 612 done in 12 timesteps! Total reward: 12410.0, eps: 0.002131763852154859

Episode 613 done in 11 timesteps! Total reward: 12421.0, eps: 0.0021104

Episode 699 done in 16 timesteps! Total reward: 14085.0, eps: 0.0008892032138206666

Episode 700 done in 26 timesteps! Total reward: 14111.0, eps: 0.0008803111816824599

Episode 701 done in 16 timesteps! Total reward: 14127.0, eps: 0.0008715080698656353

Episode 702 done in 11 timesteps! Total reward: 14138.0, eps: 0.0008627929891669789

Episode 703 done in 13 timesteps! Total reward: 14151.0, eps: 0.0008541650592753091

Episode 704 done in 9 timesteps! Total reward: 14160.0, eps: 0.000845623408682556

Episode 705 done in 16 timesteps! Total reward: 14176.0, eps: 0.0008371671745957304

Episode 706 done in 15 timesteps! Total reward: 14191.0, eps: 0.0008287955028497731

Episode 707 done in 17 timesteps! Total reward: 14208.0, eps: 0.0008205075478212754

Episode 708 done in 15 timesteps! Total reward: 14223.0, eps: 0.0008123024723430627

Episode 709 done in 12 timesteps! Total reward: 14235.0, eps: 0.000804179447619632

Episode 710 done in 29 timesteps! Total reward: 14264.0, eps: 0.0007

Episode 796 done in 36 timesteps! Total reward: 15984.0, eps: 0.0003354400388153118

Episode 797 done in 28 timesteps! Total reward: 16012.0, eps: 0.00033208563842715866

Episode 798 done in 14 timesteps! Total reward: 16026.0, eps: 0.0003287647820428871

Episode 799 done in 21 timesteps! Total reward: 16047.0, eps: 0.00032547713422245825

Episode 800 done in 14 timesteps! Total reward: 16061.0, eps: 0.00032222236288023367

Episode 801 done in 10 timesteps! Total reward: 16071.0, eps: 0.00031900013925143135

Episode 802 done in 13 timesteps! Total reward: 16084.0, eps: 0.000315810137858917

Episode 803 done in 14 timesteps! Total reward: 16098.0, eps: 0.00031265203648032783

Episode 804 done in 11 timesteps! Total reward: 16109.0, eps: 0.00030952551611552456

Episode 805 done in 23 timesteps! Total reward: 16132.0, eps: 0.00030643026095436934

Episode 806 done in 16 timesteps! Total reward: 16148.0, eps: 0.00030336595834482564

Episode 807 done in 28 timesteps! Total reward: 16176.0, e

Episode 892 done in 16 timesteps! Total reward: 17788.0, eps: 0.0001278184600874529

Episode 893 done in 21 timesteps! Total reward: 17809.0, eps: 0.00012654027548657836

Episode 894 done in 18 timesteps! Total reward: 17827.0, eps: 0.0001252748727317126

Episode 895 done in 13 timesteps! Total reward: 17840.0, eps: 0.00012402212400439546

Episode 896 done in 27 timesteps! Total reward: 17867.0, eps: 0.0001227819027643515

Episode 897 done in 14 timesteps! Total reward: 17881.0, eps: 0.00012155408373670799

Episode 898 done in 9 timesteps! Total reward: 17890.0, eps: 0.0001203385428993409

Episode 899 done in 12 timesteps! Total reward: 17902.0, eps: 0.0001191351574703475

Episode 900 done in 9 timesteps! Total reward: 17911.0, eps: 0.00011794380589564402

Episode 901 done in 17 timesteps! Total reward: 17928.0, eps: 0.00011676436783668758

Episode 902 done in 26 timesteps! Total reward: 17954.0, eps: 0.0001155967241583207

Episode 903 done in 27 timesteps! Total reward: 17981.0, eps: 

Episode 989 done in 20 timesteps! Total reward: 19736.0, eps: 4.821780729831644e-05

Episode 990 done in 13 timesteps! Total reward: 19749.0, eps: 4.773562922533328e-05

Episode 991 done in 24 timesteps! Total reward: 19773.0, eps: 4.7258272933079946e-05

Episode 992 done in 25 timesteps! Total reward: 19798.0, eps: 4.678569020374915e-05

Episode 993 done in 9 timesteps! Total reward: 19807.0, eps: 4.6317833301711654e-05

Episode 994 done in 18 timesteps! Total reward: 19825.0, eps: 4.585465496869454e-05

Episode 995 done in 27 timesteps! Total reward: 19852.0, eps: 4.5396108419007594e-05

Episode 996 done in 9 timesteps! Total reward: 19861.0, eps: 4.494214733481752e-05

Episode 997 done in 14 timesteps! Total reward: 19875.0, eps: 4.449272586146935e-05

Episode 998 done in 14 timesteps! Total reward: 19889.0, eps: 4.404779860285465e-05

Episode 999 done in 27 timesteps! Total reward: 19916.0, eps: 4.36073206168261e-05

Episode 1000 done in 53 timesteps! Total reward: 19969.0, eps: 4.