## Import Packages
* gym - collection of environments for reinforcement learning algorithms.
* numpy - package for scientific computing with Python. 
* random - implements pseudo-random number generators for various distributions.
* math -  mathematical functions defined by the C standard.

In [1]:
import math
import random

import gym
import numpy as np

## Define the environment
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The system is controlled by applying a force of +1 or -1 to the cart. The pole starts upright, and the goal is to prevent it from falling over. A reward of +1 is provided for every timestep that the pole remains upright. The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center.


In [2]:
# Create the CartPole-v0 environment; the cells below use this global `env`.
env = gym.make('CartPole-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


## Define the environment related constants.
*  Every environment comes with an action_space and an observation_space. These attributes are of type Space, and they describe the format of valid actions and observations. The Discrete space allows a fixed range of non-negative numbers, so in this case valid actions are either 0 or 1. The Box space represents an n-dimensional box, so valid observations will be an array of 4 numbers.
* Define the following constants:
   * Number of discrete states (buckets) per state dimension: cart position, cart velocity, angle between the pole and the vertical axis, and angular velocity of the pole.
   * Number of discrete actions (left, right) 
   * Bounds for each discrete state
* Define the minimum exploration and learning rate values.
   

In [3]:
env.action_space.n

2

In [4]:
env.observation_space

Box(4,)

In [5]:
env.observation_space.low 

array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32)

In [6]:
env.observation_space.high

array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)

In [7]:
# Number of buckets per observation dimension. A dimension with a single
# bucket always maps to index 0 in state_to_bucket, so it is effectively
# ignored by the Q-table.
NUM_BUCKETS = (1, 1, 6, 3)

In [8]:
# Size of the discrete action space (2 for this environment, as shown above).
NUM_ACTIONS = env.action_space.n

In [9]:
# Pair each dimension's lower and upper bound: [(low_0, high_0), (low_1, high_1), ...].
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))

In [10]:
# Dimension 1 is reported with bounds of +/-3.4e38, which is useless for
# bucketing, so replace it with a small finite range.
STATE_BOUNDS[1] = [-0.5, 0.5]

In [11]:
# Dimension 3 is likewise reported with bounds of +/-3.4e38; restrict it to
# +/-50 degrees, expressed in radians.
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]

In [12]:
STATE_BOUNDS

[(-4.8, 4.8),
 [-0.5, 0.5],
 (-0.41887903, 0.41887903),
 [-0.8726646259971648, 0.8726646259971648]]

## Create Q-Table
* Create a Q-Table for each state-action pair
* Define the learning related constants:
* The learning rate must decay, but not too fast. The conditions for convergence are the following:
    * $\sum_{t=1}^{\infty} \alpha(t) = \infty$
    * $\sum_{t=1}^{\infty} \alpha(t)^2 < \infty$
    * A schedule such as $\alpha(t) = k/(k+t)$ works well.

In [15]:
# Zero-initialised Q-table: one axis per bucketed state dimension followed by
# a final axis indexed by action.
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))

In [16]:
q_table.shape

(1, 1, 6, 3, 2)

In [17]:
print(q_table)

[[[[[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]

   [[0. 0.]
    [0. 0.]
    [0. 0.]]]]]


In [18]:
# Floor applied to the exploration rate by get_explore_rate.
EXPLORE_RATE_MIN = 0.01

In [19]:
# Floor applied to the learning rate by get_learning_rate.
LEARNING_RATE_MIN = 0.1

In [20]:
def get_explore_rate(t):
    """Return the exploration rate to use at step/episode index ``t``.

    The rate follows ``1 - log10((t + 1) / 25)``, capped at 1 from above and
    at ``EXPLORE_RATE_MIN`` from below. For ``t <= 24`` the logarithm is not
    positive, so the cap of 1 applies.
    """
    decayed = 1.0 - math.log10((t + 1) / 25)
    capped = min(1, decayed)
    return max(EXPLORE_RATE_MIN, capped)

In [21]:
def get_learning_rate(t):
    """Return the learning rate to use at step/episode index ``t``.

    The rate follows ``1 - log10((t + 1) / 25)``, capped at 0.5 from above
    and at ``LEARNING_RATE_MIN`` from below.
    """
    decayed = 1.0 - math.log10((t + 1) / 25)
    capped = min(0.5, decayed)
    return max(LEARNING_RATE_MIN, capped)

## Define method for selecting an action
* Select a random action or the action with the highest q based on explore rate

In [22]:
def select_action(state, explore_rate):
    """Pick an action for ``state`` using an epsilon-greedy rule.

    Args:
        state: Tuple of bucket indices (as returned by ``state_to_bucket``)
            used to index the global ``q_table``.
        explore_rate: Probability of choosing a random action instead of the
            greedy one.

    Returns:
        A random sample from ``env.action_space`` when exploring, otherwise
        the index of the largest Q-value stored for ``state``.
    """
    # Relies on the standard-library ``random`` module being imported at the
    # top of the file; without that import this line raises NameError.
    if random.random() < explore_rate:
        # Explore: let the environment draw a uniformly random valid action.
        return env.action_space.sample()

    # Exploit: greedy action with respect to the current Q estimates.
    return np.argmax(q_table[state])

## Define method to define states
* Discretize the continuous dimensions to a number of buckets. 
* Map the state bounds to the bucket array

In [18]:
def state_to_bucket(state):
    """Map a continuous observation to a tuple of discrete bucket indices.

    Each component ``state[i]`` is clamped to ``STATE_BOUNDS[i]`` and then
    linearly mapped onto the integer range ``0 .. NUM_BUCKETS[i] - 1``.

    Args:
        state: Sequence of numeric observation components.

    Returns:
        Tuple with one bucket index per component of ``state``.
    """
    indices = []

    for dim, value in enumerate(state):
        lower = STATE_BOUNDS[dim][0]
        if value <= lower:
            # At or below the lower bound: first bucket.
            indices.append(0)
            continue

        upper = STATE_BOUNDS[dim][1]
        last_bucket = NUM_BUCKETS[dim] - 1
        if value >= upper:
            # At or above the upper bound: last bucket.
            indices.append(last_bucket)
            continue

        # Strictly inside the bounds: scale linearly so that ``lower`` maps
        # to 0 and ``upper`` maps to ``last_bucket``, then round to nearest.
        width = upper - lower
        shift = last_bucket * lower / width
        scale = last_bucket / width
        indices.append(int(round(scale * value - shift)))

    return tuple(indices)

## Define method for simulation
* Get the initial learning and explore rates
* Initialise the discount factor and the streak counter
* Train over 1000 episodes. In each episode, do the following:
* Start the process using the reset method
* Get the initial state by passing in the current observations about the environment
* Iterate over 250 timesteps:
    * Invoke the render method to visualise the episode.
    * Execute the selected action using the step method and read the returned observation, reward, done and info values.
    * Identify the resultant state from the observations.
    * Look up the highest Q-value available from the resultant state in the Q-table.
    * Update the Q-table entry for the previous state and the action taken: $Q(s, a) \leftarrow Q(s, a) + \alpha\,[R + \gamma \max_{a'} Q(s', a') - Q(s, a)]$
    
    * Set the new state as current state.
    * If done equals true, the episode has terminated; the environment is reset at the start of the next episode.
    * An episode that lasts at least 200 timesteps (t >= 199) counts as a success: increment num_streaks for it, and reset num_streaks to zero otherwise. Training stops early once num_streaks exceeds 120.
    * Update the parameters so the learning and exploration rates decay as the episodes increase
    

In [227]:
def simulate():
    """Train the global ``q_table`` with tabular Q-learning on ``env``.

    Runs up to 1000 episodes of at most 250 steps each. Every step renders
    the environment, picks an action with ``select_action``, applies the
    Q-learning update to ``q_table`` in place and prints a progress report.
    An episode that finishes with ``t >= 199`` extends the success streak;
    any shorter episode resets it. Training stops early once the streak
    exceeds 120. The exploration and learning rates are recomputed from the
    episode index at the end of every episode.
    """
    # Rates used during the first episode.
    alpha = get_learning_rate(0)
    epsilon = get_explore_rate(0)

    gamma = 0.99
    streak_count = 0

    for episode in range(1000):

        # Start a fresh episode and discretise its initial observation.
        prev_state = state_to_bucket(env.reset())

        for t in range(250):

            env.render()

            action = select_action(prev_state, epsilon)
            observation, reward, done, _info = env.step(action)
            next_state = state_to_bucket(observation)

            # Best value reachable from the new state under current estimates.
            best_q = np.amax(q_table[next_state])

            # Move Q(prev_state, action) towards the bootstrapped target.
            cell = prev_state + (action,)
            td_error = reward + gamma * (best_q) - q_table[cell]
            q_table[cell] += alpha * td_error

            prev_state = next_state

            # Per-step progress report; the trailing empty entry produces the
            # blank separator line.
            report = (
                "\nEpisode = %d" % episode,
                "t = %d" % t,
                "Action: %d" % action,
                "State: %s" % str(next_state),
                "Reward: %f" % reward,
                "Best Q: %f" % best_q,
                "Explore rate: %f" % epsilon,
                "Learning rate: %f" % alpha,
                "Streaks: %d" % streak_count,
                "",
            )
            print("\n".join(report))

            if not done:
                continue

            print("Episode %d finished after %f time steps" % (episode, t))
            # Only episodes that reach step index 199 keep the streak alive.
            streak_count = streak_count + 1 if t >= 199 else 0
            break

        if streak_count > 120:
            break

        # Decay both rates as a function of the episode index.
        epsilon = get_explore_rate(episode)
        alpha = get_learning_rate(episode)

## Train the agent
* Invoke the simulate method
* Close the rendered environment.
* Check the values in the Q-table.

In [228]:
simulate()


Episode = 0
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 1
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 0.500000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 0.747500
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 3
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 1.120013
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 1.428156
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 5
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 1.921015
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 0
t = 6
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 2.010909
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episo


Episode = 3
t = 7
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 0.000000
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 3
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 3.061669
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 3
t = 9
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 3.061669
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 3
t = 10
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 3.546361
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 3
t = 11
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 4.028629
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 3
t = 12
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 4.028629
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 3
t = 13
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 2.881792
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


E


Episode = 5
t = 16
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 8.306926
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 5
t = 17
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 8.702990
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 5
t = 18
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 8.961443
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 5
t = 19
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 9.287409
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 5
t = 20
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 9.577989
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 5
t = 21
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 9.884809
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 5
t = 22
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 10.335385
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 12.806507
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 13.242474
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 5
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 13.455196
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 6
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 13.781559
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 7.961003
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 7.961003
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 8
t = 9
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 7.961003
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


E


Episode = 11
t = 2
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 15.493038
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 3
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 15.809667
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 16.072304
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 14.186185
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 16.072304
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 7
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 16.072304
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 11
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 16.491943
Explore rate: 1.000000
Learning rate: 0.500000
Stre


Episode = 13
t = 13
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 4.333542
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0

Episode 13 finished after 13.000000 time steps

Episode = 14
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 18.697073
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 14
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 14.538442
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 14
t = 2
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 14.538442
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 14
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 11.678851
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 14
t = 4
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 11.678851
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 14
t = 5
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 12.120457
Expl


Episode = 16
t = 6
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 20.072073
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 16
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 20.072073
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 16
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 20.072073
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 16
t = 9
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 20.072073
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 16
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 20.471713
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 16
t = 11
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 4.876911
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 16
t = 12
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 4.876911
Explore rate: 1.000000
Learning rate: 0.500000
Str


Episode = 19
t = 18
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 22.981474
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 19
t = 19
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 22.981474
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 19
t = 20
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 23.366567
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 19
t = 21
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 23.366567
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 19
t = 22
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 23.366567
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 19
t = 23
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 23.749734
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 19
t = 24
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 6.537642
Explore rate: 1.000000
Learning rate: 0.50000


Episode = 22
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 18.267279
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 12
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 18.267279
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 13
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 18.614442
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 14
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 4.885836
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 22
t = 15
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 4.885836
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0

Episode 22 finished after 15.000000 time steps

Episode = 23
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 24.138047
Explore rate: 1.000000
Learning rate: 0.500000
Streaks: 0


Episode = 23
t = 1
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 24.517356
E


Episode = 27
t = 12
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 24.856909
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 13
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 25.232624
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 14
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 25.336366
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 15
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 25.657814
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 16
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 25.822617
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 17
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 26.111102
Explore rate: 0.966576
Learning rate: 0.500000
Streaks: 0


Episode = 27
t = 18
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 26.336304
Explore rate: 0.966576
Learning rate: 0.5000


Episode = 29
t = 39
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 26.359511
Explore rate: 0.935542
Learning rate: 0.500000
Streaks: 0


Episode = 29
t = 40
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 28.641971
Explore rate: 0.935542
Learning rate: 0.500000
Streaks: 0


Episode = 29
t = 41
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 28.641971
Explore rate: 0.935542
Learning rate: 0.500000
Streaks: 0


Episode = 29
t = 42
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 28.934672
Explore rate: 0.935542
Learning rate: 0.500000
Streaks: 0


Episode = 29
t = 43
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 29.289998
Explore rate: 0.935542
Learning rate: 0.500000
Streaks: 0


Episode = 29
t = 44
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 29.319534
Explore rate: 0.935542
Learning rate: 0.500000
Streaks: 0


Episode = 29
t = 45
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 27.857531
Explore rate: 0.935542
Learning rate: 0.5000


Episode = 32
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 31.644087
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 28.690590
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 31.644087
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 5
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 31.644087
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 6
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 31.985867
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 28.836163
Explore rate: 0.892790
Learning rate: 0.500000
Streaks: 0


Episode = 32
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 28.836163
Explore rate: 0.892790
Learning rate: 0.500000
Stre


Episode = 35
t = 3
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 29.932487
Explore rate: 0.853872
Learning rate: 0.500000
Streaks: 0


Episode = 35
t = 4
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 29.932487
Explore rate: 0.853872
Learning rate: 0.500000
Streaks: 0


Episode = 35
t = 5
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 29.932487
Explore rate: 0.853872
Learning rate: 0.500000
Streaks: 0


Episode = 35
t = 6
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 29.932487
Explore rate: 0.853872
Learning rate: 0.500000
Streaks: 0


Episode = 35
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 30.282824
Explore rate: 0.853872
Learning rate: 0.500000
Streaks: 0


Episode = 35
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 30.631410
Explore rate: 0.853872
Learning rate: 0.500000
Streaks: 0


Episode = 35
t = 9
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 30.978253
Explore rate: 0.853872
Learning rate: 0.500000
Stre


Episode = 38
t = 8
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 30.786765
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 9
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 31.667338
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 31.667338
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 11
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 31.667338
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 12
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 32.009001
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 13
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 32.009001
Explore rate: 0.818156
Learning rate: 0.500000
Streaks: 0


Episode = 38
t = 14
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 32.348956
Explore rate: 0.818156
Learning rate: 0.500000


Episode = 41
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 32.809649
Explore rate: 0.785156
Learning rate: 0.500000
Streaks: 0


Episode = 41
t = 7
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 32.809649
Explore rate: 0.785156
Learning rate: 0.500000
Streaks: 0


Episode = 41
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 32.648506
Explore rate: 0.785156
Learning rate: 0.500000
Streaks: 0


Episode = 41
t = 9
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 33.145600
Explore rate: 0.785156
Learning rate: 0.500000
Streaks: 0


Episode = 41
t = 10
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 33.145600
Explore rate: 0.785156
Learning rate: 0.500000
Streaks: 0


Episode = 41
t = 11
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 33.479872
Explore rate: 0.785156
Learning rate: 0.500000
Streaks: 0


Episode = 41
t = 12
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 33.479872
Explore rate: 0.785156
Learning rate: 0.500000
S


Episode = 43
t = 25
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 31.568715
Explore rate: 0.764472
Learning rate: 0.500000
Streaks: 0


Episode = 43
t = 26
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 31.568715
Explore rate: 0.764472
Learning rate: 0.500000
Streaks: 0


Episode = 43
t = 27
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 31.568715
Explore rate: 0.764472
Learning rate: 0.500000
Streaks: 0


Episode = 43
t = 28
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 31.844429
Explore rate: 0.764472
Learning rate: 0.500000
Streaks: 0


Episode = 43
t = 29
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 32.185207
Explore rate: 0.764472
Learning rate: 0.500000
Streaks: 0


Episode = 43
t = 30
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 32.524281
Explore rate: 0.764472
Learning rate: 0.500000
Streaks: 0


Episode = 43
t = 31
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 27.511779
Explore rate: 0.764472
Learning rate: 0.5000


Episode = 46
t = 5
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 38.302065
Explore rate: 0.735182
Learning rate: 0.500000
Streaks: 0


Episode = 46
t = 6
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 38.302065
Explore rate: 0.735182
Learning rate: 0.500000
Streaks: 0


Episode = 46
t = 7
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 38.610555
Explore rate: 0.735182
Learning rate: 0.500000
Streaks: 0


Episode = 46
t = 8
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 38.677928
Explore rate: 0.735182
Learning rate: 0.500000
Streaks: 0


Episode = 46
t = 9
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 33.999704
Explore rate: 0.735182
Learning rate: 0.500000
Streaks: 0


Episode = 46
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 23.735779
Explore rate: 0.735182
Learning rate: 0.500000
Streaks: 0


Episode = 46
t = 11
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 23.735779
Explore rate: 0.735182
Learning rate: 0.500000
St


Episode = 49
t = 1
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 30.224595
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 2
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.424685
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 34.632517
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.424685
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 36.836477
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 6
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 38.424685
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 37.938458
Explore rate: 0.707744
Learning rate: 0.500000
Stre


Episode = 49
t = 64
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 41.961994
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 65
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 41.961994
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 66
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 42.252184
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 67
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 42.252184
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 68
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 42.434290
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 69
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 42.775435
Explore rate: 0.707744
Learning rate: 0.500000
Streaks: 0


Episode = 49
t = 70
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 42.805779
Explore rate: 0.707744
Learning rate: 0.5000


Episode = 52
t = 16
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 45.160390
Explore rate: 0.681937
Learning rate: 0.500000
Streaks: 0


Episode = 52
t = 17
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 45.325348
Explore rate: 0.681937
Learning rate: 0.500000
Streaks: 0


Episode = 52
t = 18
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 45.516242
Explore rate: 0.681937
Learning rate: 0.500000
Streaks: 0


Episode = 52
t = 19
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 45.693214
Explore rate: 0.681937
Learning rate: 0.500000
Streaks: 0


Episode = 52
t = 20
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 45.876262
Explore rate: 0.681937
Learning rate: 0.500000
Streaks: 0


Episode = 52
t = 21
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 14.276095
Explore rate: 0.681937
Learning rate: 0.500000
Streaks: 0


Episode = 52
t = 22
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 14.276095
Explore rate: 0.681937
Learning rate: 0.5000


Episode = 55
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 43.819645
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0


Episode = 55
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 44.100547
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0


Episode = 55
t = 9
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 44.380045
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0


Episode = 55
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 44.380045
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0


Episode = 55
t = 11
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 15.977757
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0


Episode = 55
t = 12
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 15.977757
Explore rate: 0.657577
Learning rate: 0.500000
Streaks: 0

Episode 55 finished after 12.000000 time steps

Episode = 56
t = 0
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 44.023190
E


Episode = 58
t = 2
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 35.921248
Explore rate: 0.634512
Learning rate: 0.500000
Streaks: 0


Episode = 58
t = 3
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 36.241642
Explore rate: 0.634512
Learning rate: 0.500000
Streaks: 0


Episode = 58
t = 4
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 36.241642
Explore rate: 0.634512
Learning rate: 0.500000
Streaks: 0


Episode = 58
t = 5
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 36.560434
Explore rate: 0.634512
Learning rate: 0.500000
Streaks: 0


Episode = 58
t = 6
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 36.877632
Explore rate: 0.634512
Learning rate: 0.500000
Streaks: 0


Episode = 58
t = 7
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 37.193243
Explore rate: 0.634512
Learning rate: 0.500000
Streaks: 0


Episode = 58
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 37.507277
Explore rate: 0.634512
Learning rate: 0.500000
Stre


Episode = 60
t = 29
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 45.225122
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0


Episode = 60
t = 30
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 45.498996
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0


Episode = 60
t = 31
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 45.586004
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0


Episode = 60
t = 32
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 45.858074
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0


Episode = 60
t = 33
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 45.949245
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0


Episode = 60
t = 34
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.173913
Explore rate: 0.619789
Learning rate: 0.500000
Streaks: 0


Episode = 60
t = 35
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.330709
Explore rate: 0.619789
Learning rate: 0.5000


Episode = 63
t = 24
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 45.281864
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 25
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 45.276206
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 26
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 45.276206
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 27
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 45.552654
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 28
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 45.823476
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 29
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 45.958948
Explore rate: 0.598599
Learning rate: 0.500000
Streaks: 0


Episode = 63
t = 30
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 46.161417
Explore rate: 0.598599
Learning rate: 0.5000


Episode = 68
t = 2
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 46.520656
Explore rate: 0.565431
Learning rate: 0.500000
Streaks: 0


Episode = 68
t = 3
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 46.788053
Explore rate: 0.565431
Learning rate: 0.500000
Streaks: 0


Episode = 68
t = 4
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 47.054113
Explore rate: 0.565431
Learning rate: 0.500000
Streaks: 0


Episode = 68
t = 5
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 47.054113
Explore rate: 0.565431
Learning rate: 0.500000
Streaks: 0


Episode = 68
t = 6
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 47.054113
Explore rate: 0.565431
Learning rate: 0.500000
Streaks: 0


Episode = 68
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 47.318842
Explore rate: 0.565431
Learning rate: 0.500000
Streaks: 0


Episode = 68
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 47.582248
Explore rate: 0.565431
Learning rate: 0.500000
Stre


Episode = 71
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 48.366346
Explore rate: 0.546682
Learning rate: 0.500000
Streaks: 0


Episode = 71
t = 3
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 48.443652
Explore rate: 0.546682
Learning rate: 0.500000
Streaks: 0


Episode = 71
t = 4
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 49.162253
Explore rate: 0.546682
Learning rate: 0.500000
Streaks: 0


Episode = 71
t = 5
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 49.162253
Explore rate: 0.546682
Learning rate: 0.500000
Streaks: 0


Episode = 71
t = 6
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 49.162253
Explore rate: 0.546682
Learning rate: 0.500000
Streaks: 0


Episode = 71
t = 7
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 49.416442
Explore rate: 0.546682
Learning rate: 0.500000
Streaks: 0


Episode = 71
t = 8
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 49.416442
Explore rate: 0.546682
Learning rate: 0.500000
Stre


Episode = 74
t = 18
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 22.873614
Explore rate: 0.528708
Learning rate: 0.500000
Streaks: 0


Episode = 74
t = 19
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 22.873614
Explore rate: 0.528708
Learning rate: 0.500000
Streaks: 0

Episode 74 finished after 19.000000 time steps

Episode = 75
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 50.553636
Explore rate: 0.522879
Learning rate: 0.500000
Streaks: 0


Episode = 75
t = 1
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 50.553636
Explore rate: 0.522879
Learning rate: 0.500000
Streaks: 0


Episode = 75
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 50.800868
Explore rate: 0.522879
Learning rate: 0.500000
Streaks: 0


Episode = 75
t = 3
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 51.046864
Explore rate: 0.522879
Learning rate: 0.500000
Streaks: 0


Episode = 75
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 51.046864
Ex


Episode = 77
t = 20
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 48.309203
Explore rate: 0.511449
Learning rate: 0.500000
Streaks: 0


Episode = 77
t = 21
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 48.264647
Explore rate: 0.511449
Learning rate: 0.500000
Streaks: 0


Episode = 77
t = 22
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 48.674829
Explore rate: 0.511449
Learning rate: 0.500000
Streaks: 0


Episode = 77
t = 23
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 23.642950
Explore rate: 0.511449
Learning rate: 0.500000
Streaks: 0


Episode = 77
t = 24
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 23.642950
Explore rate: 0.511449
Learning rate: 0.500000
Streaks: 0


Episode = 77
t = 25
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 24.024735
Explore rate: 0.511449
Learning rate: 0.500000
Streaks: 0


Episode = 77
t = 26
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 24.404611
Explore rate: 0.511449
Learning rate: 0.5000


Episode = 80
t = 29
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 25.158675
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0


Episode = 80
t = 30
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 25.158675
Explore rate: 0.494850
Learning rate: 0.494850
Streaks: 0

Episode 80 finished after 30.000000 time steps

Episode = 81
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 47.789091
Explore rate: 0.489455
Learning rate: 0.489455
Streaks: 0


Episode = 81
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 43.326147
Explore rate: 0.489455
Learning rate: 0.489455
Streaks: 0


Episode = 81
t = 2
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 43.326147
Explore rate: 0.489455
Learning rate: 0.489455
Streaks: 0


Episode = 81
t = 3
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 44.879187
Explore rate: 0.489455
Learning rate: 0.489455
Streaks: 0


Episode = 81
t = 4
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 49.883692
Ex


Episode = 84
t = 17
Action: 0
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 26.248300
Explore rate: 0.473661
Learning rate: 0.473661
Streaks: 0


Episode = 84
t = 18
Action: 0
State: (0, 0, 1, 1)
Reward: 1.000000
Best Q: 9.655074
Explore rate: 0.473661
Learning rate: 0.473661
Streaks: 0


Episode = 84
t = 19
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 26.248300
Explore rate: 0.473661
Learning rate: 0.473661
Streaks: 0


Episode = 84
t = 20
Action: 0
State: (0, 0, 1, 1)
Reward: 1.000000
Best Q: 17.863966
Explore rate: 0.473661
Learning rate: 0.473661
Streaks: 0


Episode = 84
t = 21
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 26.248300
Explore rate: 0.473661
Learning rate: 0.473661
Streaks: 0


Episode = 84
t = 22
Action: 1
State: (0, 0, 1, 0)
Reward: 1.000000
Best Q: 26.248300
Explore rate: 0.473661
Learning rate: 0.473661
Streaks: 0

Episode 84 finished after 22.000000 time steps

Episode = 85
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 46.897073


Episode = 86
t = 42
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 48.029186
Explore rate: 0.463442
Learning rate: 0.463442
Streaks: 0


Episode = 86
t = 43
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 48.029186
Explore rate: 0.463442
Learning rate: 0.463442
Streaks: 0


Episode = 86
t = 44
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 47.600181
Explore rate: 0.463442
Learning rate: 0.463442
Streaks: 0


Episode = 86
t = 45
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 47.600181
Explore rate: 0.463442
Learning rate: 0.463442
Streaks: 0


Episode = 86
t = 46
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 48.202442
Explore rate: 0.463442
Learning rate: 0.463442
Streaks: 0


Episode = 86
t = 47
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 48.523940
Explore rate: 0.463442
Learning rate: 0.463442
Streaks: 0


Episode = 86
t = 48
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 49.443901
Explore rate: 0.463442
Learning rate: 0.4634


Episode = 87
t = 29
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 52.245352
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 30
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 52.245352
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 31
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 52.464269
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 32
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 49.755070
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 33
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 14.025780
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 34
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 17.917582
Explore rate: 0.458421
Learning rate: 0.458421
Streaks: 0


Episode = 87
t = 35
Action: 1
State: (0, 0, 4, 1)
Reward: 1.000000
Best Q: 16.186146
Explore rate: 0.458421
Learning rate: 0.4584


Episode = 91
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 46.293944
Explore rate: 0.438899
Learning rate: 0.438899
Streaks: 0


Episode = 91
t = 9
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 46.293944
Explore rate: 0.438899
Learning rate: 0.438899
Streaks: 0


Episode = 91
t = 10
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 47.998500
Explore rate: 0.438899
Learning rate: 0.438899
Streaks: 0


Episode = 91
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.402565
Explore rate: 0.438899
Learning rate: 0.438899
Streaks: 0


Episode = 91
t = 12
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.402565
Explore rate: 0.438899
Learning rate: 0.438899
Streaks: 0


Episode = 91
t = 13
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.402565
Explore rate: 0.438899
Learning rate: 0.438899
Streaks: 0


Episode = 91
t = 14
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.633415
Explore rate: 0.438899
Learning rate: 0.438899


Episode = 96
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.547492
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 46.293997
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.795191
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 3
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 46.293997
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 4
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.810099
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 5
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.810099
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.810099
Explore rate: 0.415669
Learning rate: 0.415669
Stre


Episode = 96
t = 63
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 47.270917
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 64
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 47.490095
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 65
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 47.708362
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 66
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 47.708362
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 67
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 47.741318
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 68
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 48.066294
Explore rate: 0.415669
Learning rate: 0.415669
Streaks: 0


Episode = 96
t = 69
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 48.092273
Explore rate: 0.415669
Learning rate: 0.4156


Episode = 98
t = 24
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 45.673898
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 25
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 45.673898
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 26
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.883486
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 27
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.883486
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 28
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 46.921934
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 29
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 47.137810
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 30
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 49.760370
Explore rate: 0.406714
Learning rate: 0.4067


Episode = 98
t = 87
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 51.474253
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 88
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 51.474253
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 89
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 51.671614
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 90
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 51.868173
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 91
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 52.063932
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 92
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 52.063932
Explore rate: 0.406714
Learning rate: 0.406714
Streaks: 0


Episode = 98
t = 93
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 49.837890
Explore rate: 0.406714
Learning rate: 0.4067


Episode = 100
t = 14
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 51.674157
Explore rate: 0.397940
Learning rate: 0.397940
Streaks: 0


Episode = 100
t = 15
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 52.565999
Explore rate: 0.397940
Learning rate: 0.397940
Streaks: 0


Episode = 100
t = 16
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 52.217815
Explore rate: 0.397940
Learning rate: 0.397940
Streaks: 0


Episode = 100
t = 17
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 52.617587
Explore rate: 0.397940
Learning rate: 0.397940
Streaks: 0


Episode = 100
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 52.565454
Explore rate: 0.397940
Learning rate: 0.397940
Streaks: 0


Episode = 100
t = 19
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 52.785602
Explore rate: 0.397940
Learning rate: 0.397940
Streaks: 0


Episode = 100
t = 20
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 52.840945
Explore rate: 0.397940
Learning rate:


Episode = 102
t = 14
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 53.902491
Explore rate: 0.389340
Learning rate: 0.389340
Streaks: 0


Episode = 102
t = 15
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 54.081967
Explore rate: 0.389340
Learning rate: 0.389340
Streaks: 0


Episode = 102
t = 16
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 53.236390
Explore rate: 0.389340
Learning rate: 0.389340
Streaks: 0


Episode = 102
t = 17
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 54.043992
Explore rate: 0.389340
Learning rate: 0.389340
Streaks: 0


Episode = 102
t = 18
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 53.729747
Explore rate: 0.389340
Learning rate: 0.389340
Streaks: 0


Episode = 102
t = 19
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 54.101792
Explore rate: 0.389340
Learning rate: 0.389340
Streaks: 0


Episode = 102
t = 20
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 54.053299
Explore rate: 0.389340
Learning rate:


Episode = 103
t = 11
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 22.428897
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0


Episode = 103
t = 12
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 22.428897
Explore rate: 0.385103
Learning rate: 0.385103
Streaks: 0

Episode 103 finished after 12.000000 time steps

Episode = 104
t = 0
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 54.128151
Explore rate: 0.380907
Learning rate: 0.380907
Streaks: 0


Episode = 104
t = 1
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 54.641588
Explore rate: 0.380907
Learning rate: 0.380907
Streaks: 0


Episode = 104
t = 2
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 54.604669
Explore rate: 0.380907
Learning rate: 0.380907
Streaks: 0


Episode = 104
t = 3
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 54.800439
Explore rate: 0.380907
Learning rate: 0.380907
Streaks: 0


Episode = 104
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 54.4


Episode = 107
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 53.039422
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 9
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.231291
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.231291
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.411031
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 12
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.590109
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 13
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.768526
Explore rate: 0.368556
Learning rate: 0.368556
Streaks: 0


Episode = 107
t = 14
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.946287
Explore rate: 0.368556
Learning rate: 0


Episode = 108
t = 56
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 49.279472
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 57
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 54.166105
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 58
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 54.166105
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 59
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 54.333177
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 60
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 54.466309
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 61
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 54.653468
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 62
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 54.653468
Explore rate: 0.364516
Learning rate:


Episode = 108
t = 119
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 55.454525
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 120
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.420170
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 121
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 55.604503
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 122
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 55.649191
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 123
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 55.782458
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 124
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 55.628850
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 125
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 55.628850
Explore rate: 0.364516
Learnin


Episode = 108
t = 183
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.382025
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 184
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.382025
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 185
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 57.022170
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 186
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.382025
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 187
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.382025
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 188
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.533729
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 0


Episode = 108
t = 189
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.684880
Explore rate: 0.364516
Learnin


Episode = 108
t = 235
Action: 1
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 36

Episode 108 finished after 235.000000 time steps

Episode = 108
t = 236
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 37

Episode 108 finished after 236.000000 time steps

Episode = 108
t = 237
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 38

Episode 108 finished after 237.000000 time steps

Episode = 108
t = 238
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 39

Episode 108 finished after 238.000000 time steps

Episode = 108
t = 239
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.364516
Learning rate: 0.364516
Streaks: 40

Episode 108 finished after 239.000000 time steps

Episode = 108
t = 2


Episode = 110
t = 24
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.067312
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 25
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.808474
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 26
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.808474
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 27
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.955341
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 28
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.955341
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 29
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 59.101685
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 30
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 57.924799
Explore rate: 0.356547
Learning rate:


Episode = 110
t = 89
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 57.682151
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 90
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 57.833035
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 91
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 57.833035
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 92
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 57.547267
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 93
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 57.979249
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 94
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 57.851113
Explore rate: 0.356547
Learning rate: 0.356547
Streaks: 0


Episode = 110
t = 95
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 57.851113
Explore rate: 0.356547
Learning rate:


Episode = 112
t = 14
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 59.770049
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 15
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 59.770049
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 16
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 59.910339
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 17
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 59.910339
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 18
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.050141
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 19
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.050141
Explore rate: 0.348722
Learning rate: 0.348722
Streaks: 0


Episode = 112
t = 20
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 22.921629
Explore rate: 0.348722
Learning rate:


Episode = 114
t = 42
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 56.903235
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 43
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 55.349874
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 44
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 56.903235
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 45
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 56.903235
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 46
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 57.050210
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 47
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 59.555783
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 48
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 58.139149
Explore rate: 0.341035
Learning rate:


Episode = 114
t = 106
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 59.926616
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 107
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 59.926616
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 108
Action: 0
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 59.926616
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 109
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 60.063280
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 110
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 58.406787
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 111
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.205725
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 112
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.205725
Explore rate: 0.341035
Learnin


Episode = 114
t = 170
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.904342
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 171
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 59.493625
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 172
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.490831
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 173
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.490831
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 174
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.622161
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 175
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.753043
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 0


Episode = 114
t = 176
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.753043
Explore rate: 0.341035
Learnin


Episode = 114
t = 221
Action: 1
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 22

Episode 114 finished after 221.000000 time steps

Episode = 114
t = 222
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 23

Episode 114 finished after 222.000000 time steps

Episode = 114
t = 223
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 24

Episode 114 finished after 223.000000 time steps

Episode = 114
t = 224
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 25

Episode 114 finished after 224.000000 time steps

Episode = 114
t = 225
Action: 0
State: (0, 0, 5, 2)
Reward: 0.000000
Best Q: 0.000000
Explore rate: 0.341035
Learning rate: 0.341035
Streaks: 26

Episode 114 finished after 225.000000 time steps

Episode = 114
t = 2


Episode = 116
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.832415
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 9
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 18.856296
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0


Episode = 116
t = 10
Action: 1
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 18.856296
Explore rate: 0.333482
Learning rate: 0.333482
Streaks: 0

Episode 116 finished after 10.000000 time steps

Episode = 117
t = 0
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.853504
Explore rate: 0.329754
Learning rate: 0.329754
Streaks: 0


Episode = 117
t = 1
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.191792
Explore rate: 0.329754
Learning rate: 0.329754
Streaks: 0


Episode = 117
t = 2
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.191792
Explore rate: 0.329754
Learning rate: 0.329754
Streaks: 0


Episode = 117
t = 3
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 47.36


Episode = 118
t = 51
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.112080
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 52
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.232356
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 53
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.887867
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 54
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.887867
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 55
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.015395
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 56
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.142508
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 57
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 49.867056
Explore rate: 0.326058
Learning rate:


Episode = 118
t = 116
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.268129
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 117
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.517267
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 118
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.517267
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 119
Action: 0
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.517267
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 120
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.636222
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 121
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.636222
Explore rate: 0.326058
Learning rate: 0.326058
Streaks: 0


Episode = 118
t = 122
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.636222
Explore rate: 0.326058
Learnin


Episode = 122
t = 14
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 63.872970
Explore rate: 0.311580
Learning rate: 0.311580
Streaks: 0


Episode = 122
t = 15
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.454493
Explore rate: 0.311580
Learning rate: 0.311580
Streaks: 0


Episode = 122
t = 16
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 64.164913
Explore rate: 0.311580
Learning rate: 0.311580
Streaks: 0


Episode = 122
t = 17
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.475921
Explore rate: 0.311580
Learning rate: 0.311580
Streaks: 0


Episode = 122
t = 18
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.475921
Explore rate: 0.311580
Learning rate: 0.311580
Streaks: 0


Episode = 122
t = 19
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 64.372503
Explore rate: 0.311580
Learning rate: 0.311580
Streaks: 0


Episode = 122
t = 20
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 64.630904
Explore rate: 0.311580
Learning rate:


Episode = 124
t = 5
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.730784
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.841230
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 7
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 61.854183
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 8
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.793275
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 9
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.793275
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 10
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 51.940073
Explore rate: 0.304518
Learning rate: 0.304518
Streaks: 0


Episode = 124
t = 11
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 52.086425
Explore rate: 0.304518
Learning rate: 0.30


Episode = 128
t = 8
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.389933
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 9
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.389933
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 10
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.516721
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 11
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.643140
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 12
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.643140
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 13
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.769191
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 14
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 56.769191
Explore rate: 0.290730
Learning rate: 0


Episode = 128
t = 73
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.262476
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 74
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.378005
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 75
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.493198
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 76
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.459178
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 77
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.459178
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 78
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 61.255376
Explore rate: 0.290730
Learning rate: 0.290730
Streaks: 0


Episode = 128
t = 79
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.568321
Explore rate: 0.290730
Learning rate:


Episode = 130
t = 6
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.027965
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 7
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 60.303396
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 8
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.014174
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 9
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 60.615973
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 10
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.012935
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 11
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.012935
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 12
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 60.839431
Explore rate: 0.283997
Learning rate: 0.2


Episode = 130
t = 71
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.981170
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 72
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.599321
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 73
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.599321
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 74
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.708378
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 75
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.708378
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 76
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.817125
Explore rate: 0.283997
Learning rate: 0.283997
Streaks: 0


Episode = 130
t = 77
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 61.132869
Explore rate: 0.283997
Learning rate:


Episode = 131
t = 11
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.884291
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 12
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.207899
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 13
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.207899
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 14
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.207899
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 15
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.316776
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 16
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.316776
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 17
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.803326
Explore rate: 0.280669
Learning rate:


Episode = 131
t = 76
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.162731
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 77
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 59.603354
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 78
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 59.603354
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 79
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 60.367897
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 80
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 60.367897
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 81
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 60.367897
Explore rate: 0.280669
Learning rate: 0.280669
Streaks: 0


Episode = 131
t = 82
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 60.367897
Explore rate: 0.280669
Learning rate:


Episode = 134
t = 21
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 20.314644
Explore rate: 0.270835
Learning rate: 0.270835
Streaks: 0


Episode = 134
t = 22
Action: 0
State: (0, 0, 4, 2)
Reward: 1.000000
Best Q: 20.530460
Explore rate: 0.270835
Learning rate: 0.270835
Streaks: 0

Episode 134 finished after 22.000000 time steps

Episode = 135
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 62.036776
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.471737
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 2
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 62.328008
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 3
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.534087
Explore rate: 0.267606
Learning rate: 0.267606
Streaks: 0


Episode = 135
t = 4
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 62.4


Episode = 136
t = 41
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.293936
Explore rate: 0.264401
Learning rate: 0.264401
Streaks: 0


Episode = 136
t = 42
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 57.592017
Explore rate: 0.264401
Learning rate: 0.264401
Streaks: 0


Episode = 136
t = 43
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 57.592017
Explore rate: 0.264401
Learning rate: 0.264401
Streaks: 0


Episode = 136
t = 44
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.220475
Explore rate: 0.264401
Learning rate: 0.264401
Streaks: 0


Episode = 136
t = 45
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.220475
Explore rate: 0.264401
Learning rate: 0.264401
Streaks: 0


Episode = 136
t = 46
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 58.220475
Explore rate: 0.264401
Learning rate: 0.264401
Streaks: 0


Episode = 136
t = 47
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 57.951128
Explore rate: 0.264401
Learning rate:


Episode = 139
t = 4
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 58.556803
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 5
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.575138
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 6
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 59.755300
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 7
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 59.755300
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 8
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 59.755300
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 9
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.958884
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 10
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 60.490466
Explore rate: 0.254925
Learning rate: 0.254


Episode = 139
t = 68
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.916716
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 69
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.916716
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 70
Action: 1
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 60.916716
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 71
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.016349
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 72
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 60.436123
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 73
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 57.659789
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 74
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 60.287516
Explore rate: 0.254925
Learning rate:


Episode = 139
t = 132
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.077019
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 133
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 61.037953
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 134
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 60.600646
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 135
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 60.600646
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 136
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 60.701084
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 137
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 60.975441
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 138
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 61.240314
Explore rate: 0.254925
Learnin


Episode = 139
t = 197
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 60.865166
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 198
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.656972
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0


Episode = 139
t = 199
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.656972
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 0

Episode 139 finished after 199.000000 time steps

Episode = 139
t = 200
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.754718
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 1

Episode 139 finished after 200.000000 time steps

Episode = 139
t = 201
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 61.852215
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 2

Episode 139 finished after 201.000000 time steps

Episode = 139
t = 202
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 61.390370
Explore rate: 0.254925
Learnin


Episode = 139
t = 248
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.611375
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 49

Episode 139 finished after 248.000000 time steps

Episode = 139
t = 249
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.706688
Explore rate: 0.254925
Learning rate: 0.254925
Streaks: 50

Episode 139 finished after 249.000000 time steps

Episode = 140
t = 0
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 62.363180
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 1
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 62.322666
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 2
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.801758
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 3
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 62.801758
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 4
A


Episode = 140
t = 62
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.230119
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 63
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.230119
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 64
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 63.322710
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 65
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 62.867501
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 66
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 62.867501
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 67
Action: 0
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 62.879658
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 68
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 63.403261
Explore rate: 0.251812
Learning


Episode = 140
t = 127
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 64.075742
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 128
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.260073
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 129
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.260073
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 130
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.350071
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 131
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 64.073383
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 132
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.438030
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 133
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 63.499200
Explore rate: 0.251812
L


Episode = 140
t = 190
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 64.875001
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 191
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.125837
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 192
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 65.025982
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 193
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 65.188761
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 194
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 65.154631
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 195
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.922975
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 51


Episode = 140
t = 196
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 64.922975
Explore rate: 0.251812
L


Episode = 140
t = 241
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 65.411147
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 93

Episode 140 finished after 241.000000 time steps

Episode = 140
t = 242
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 65.498246
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 94

Episode 140 finished after 242.000000 time steps

Episode = 140
t = 243
Action: 1
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 65.585126
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 95

Episode 140 finished after 243.000000 time steps

Episode = 140
t = 244
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 65.585126
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 96

Episode 140 finished after 244.000000 time steps

Episode = 140
t = 245
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 65.441851
Explore rate: 0.251812
Learning rate: 0.251812
Streaks: 97

Episode 140 finished after 245.000000 time steps

Episode = 140



Episode = 141
t = 55
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 66.551943
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 56
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 66.551943
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 57
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 66.635136
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 58
Action: 1
State: (0, 0, 2, 0)
Reward: 1.000000
Best Q: 65.695811
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 59
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 66.580644
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 60
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 66.580644
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 61
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 66.663765
Explore rate: 0.248721
Le


Episode = 141
t = 119
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 67.358589
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 120
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 67.439775
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 121
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 67.014233
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 122
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 67.268493
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 123
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 67.268493
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 124
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 67.349903
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 125
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 66.810220
Explore rate: 0.24


Episode = 141
t = 182
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.034363
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 183
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.034363
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 184
Action: 0
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.113868
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 185
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.113868
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 186
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.193175
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 187
Action: 1
State: (0, 0, 3, 0)
Reward: 1.000000
Best Q: 67.844365
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 102


Episode = 141
t = 188
Action: 0
State: (0, 0, 2, 1)
Reward: 1.000000
Best Q: 68.290762
Explore rate: 0.24


Episode = 141
t = 232
Action: 0
State: (0, 0, 2, 2)
Reward: 1.000000
Best Q: 68.712378
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 135

Episode 141 finished after 232.000000 time steps

Episode = 141
t = 233
Action: 1
State: (0, 0, 3, 1)
Reward: 1.000000
Best Q: 68.524656
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 136

Episode 141 finished after 233.000000 time steps

Episode = 141
t = 234
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 67.429376
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 137

Episode 141 finished after 234.000000 time steps

Episode = 141
t = 235
Action: 0
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 67.429376
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 138

Episode 141 finished after 235.000000 time steps

Episode = 141
t = 236
Action: 1
State: (0, 0, 3, 2)
Reward: 1.000000
Best Q: 67.429376
Explore rate: 0.248721
Learning rate: 0.248721
Streaks: 139

Episode 141 finished after 236.000000 time steps

Episode =

In [229]:
# Training is finished: close the Gym environment (`env` comes from
# gym.make('CartPole-v0') near the top of the notebook) so that whatever the
# environment holds open -- e.g. a render window, if one was opened -- is
# cleaned up.
env.close()

In [230]:
# Dump the Q-table for inspection. In the output recorded below it prints as
# a 5-D array of shape (1, 1, 6, 3, 2).
# NOTE(review): presumably the first four axes are the discretised state
# buckets (matching the 4-tuples shown as "State" in the training log) and the
# last axis holds one Q-value per action (env.action_space.n == 2) -- confirm
# against the cell where q_table is created.
print(q_table)

[[[[[ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]]

   [[30.30637202 33.37867099]
    [ 5.77801176 24.95325648]
    [ 0.          0.        ]]

   [[68.08267828 56.72551123]
    [69.09711054 68.02461963]
    [64.45902239 68.7439736 ]]

   [[68.77855073 65.64067919]
    [68.10945104 68.77116879]
    [61.63491999 67.84094155]]

   [[ 0.          0.        ]
    [21.72750622 18.79018525]
    [21.37016986 19.30913702]]

   [[ 0.          0.        ]
    [ 0.          0.        ]
    [ 0.          0.        ]]]]]
