In [1]:
# import libraries
import gym
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
# Hyperparameters
# ALPHA weights the new estimate against the old Q-value in update_q_values;
# GAMMA scales the best Q-value of the next state in that same update.
ALPHA = 0.25
GAMMA = 0.9
# upper bound on the number of episodes run_render iterates over
NUM_EPISODES = 500


# position bins: values ranging from -1.2 to 0.6 in steps of 0.1
Q_VALUE_POSITIONS = 19

# velocity bins: values ranging from -0.07 to 0.07 in steps of 0.01
Q_VALUE_VELOCITY = 15

# 3 actions (0, 1 and 2) available
# it can either accelerate forwards, accelerate backwards, or do nothing
Q_VALUE_ACTIONS = 3

In [3]:
# create the Q-values tensor, initialized to zeros;
# its axes are (position index, velocity index, action)
q_values = np.zeros((Q_VALUE_POSITIONS, Q_VALUE_VELOCITY, Q_VALUE_ACTIONS))

In [4]:
def get_values(raw_position, raw_velocity):
    """Map a raw (position, velocity) observation to Q-table indices.

    The position is rounded to one decimal and shifted so that -1.2 maps to
    index 0; the velocity is rounded to two decimals and shifted so that
    -0.07 maps to index 0.

    Args:
        raw_position: car position (Python float or NumPy scalar).
        raw_velocity: car velocity (Python float or NumPy scalar).

    Returns:
        Tuple ``(position_index, velocity_index)`` of built-in ints.
    """
    # Convert to built-in floats first: float32 observations carry
    # representation error (e.g. float32(-1.1) == -1.1000000238...) that would
    # otherwise leak into the scaled value below.
    rounded_position = round(float(raw_position), 1)
    rounded_velocity = round(float(raw_velocity), 2)

    # Turn the rounded values into index numbers. Round to the nearest integer
    # instead of truncating: scaling can yield values such as 0.9999999 or
    # 9.9999999, which int() alone would push into the neighbouring lower bin.
    position = round(rounded_position * 10 + 12)
    velocity = round(rounded_velocity * 100 + 7)

    return int(position), int(velocity)

In [5]:
def get_action(state):
    """Return the greedy action for ``state`` according to the Q-table.

    ``state`` is unpacked into a raw position and a raw velocity, which are
    discretised with ``get_values``; the action with the largest stored
    Q-value for that cell is returned (first one wins on ties).
    """
    raw_pos, raw_vel = state
    pos_idx, vel_idx = get_values(raw_pos, raw_vel)

    # Q-values of every action available in this discretised state
    action_values = q_values[pos_idx, vel_idx]
    return action_values.argmax()

In [6]:
def update_q_values(state, action, next_state, reward):
    """Apply one Q-learning update to the global ``q_values`` table.

    The entry for (``state``, ``action``) is blended with the target
    ``reward + GAMMA * max_a Q(next_state, a)`` using learning rate ``ALPHA``.
    The table is modified in place; nothing is returned.
    """
    # locate the table cell of the state the action was taken in
    raw_pos, raw_vel = state
    pos_idx, vel_idx = get_values(raw_pos, raw_vel)

    # locate the row of the state that was reached
    next_raw_pos, next_raw_vel = next_state
    next_pos_idx, next_vel_idx = get_values(next_raw_pos, next_raw_vel)

    # best value obtainable from the next state, and the resulting target
    best_next_value = q_values[next_pos_idx, next_vel_idx].max()
    td_target = reward + GAMMA * best_next_value

    cell = (pos_idx, vel_idx, action)
    q_values[cell] = (1 - ALPHA) * q_values[cell] + ALPHA * td_target

In [7]:
# max number of steps per episode (passed to run_render)
steps_num = 500

In [8]:
def run_render(steps_num):
    """Run rendered Q-learning episodes on MountainCar-v0.

    Up to ``NUM_EPISODES`` episodes of at most ``steps_num`` steps are played
    with the greedy policy from ``get_action``; the Q-table is updated after
    every step. The function returns as soon as an episode reports ``done``
    (after a 5 second pause), or once all episodes have been played.

    Args:
        steps_num: maximum number of environment steps per episode.

    Returns:
        None. Progress is reported via ``print``.
    """
    env = gym.make("MountainCar-v0")
    # NOTE(review): presumably lifts gym's time-limit cutoff just above the
    # 500-step budget so that `done` is only raised by the environment itself
    # -- confirm against the installed gym version.
    env._max_episode_steps = 501

    try:
        for i in range(NUM_EPISODES):
            # get initial state
            state = env.reset()
            # stays 0 when steps_num is 0, so the summary below never hits an
            # unbound loop variable
            steps_taken = 0

            for step in range(steps_num):
                env.render()

                # act greedily, observe the transition and learn from it
                action = get_action(state)
                next_state, reward, done, _ = env.step(action)
                update_q_values(state, action, next_state, reward)
                state = next_state
                steps_taken = step + 1

                # if the stopping cond is satisfied, stop the whole run
                if done:
                    # sleep for 5 secs so i can take a screenshot
                    time.sleep(5)

                    print ("episode", i+1 ,"finished after ", steps_taken, "steps.")
                    print('number of episodes: ', i+1,'\nnumber of steps: ', steps_taken )
                    return

            print ("episode", i+1 ,"finished after ", steps_taken, "steps.")
    finally:
        # always release the environment / render window, including when no
        # episode ever finishes or an exception interrupts the run
        env.close()

    return

In [9]:
# run the rendered loop; it returns at the first episode that reports `done`
run_render(steps_num)

episode 1 finished after  500 steps.
episode 2 finished after  500 steps.
episode 3 finished after  500 steps.
episode 4 finished after  500 steps.
episode 5 finished after  500 steps.
episode 6 finished after  500 steps.
episode 7 finished after  500 steps.
episode 8 finished after  500 steps.
episode 9 finished after  500 steps.
episode 10 finished after  500 steps.
episode 11 finished after  500 steps.
episode 12 finished after  500 steps.
episode 13 finished after  500 steps.
episode 14 finished after  500 steps.
episode 15 finished after  500 steps.
episode 16 finished after  500 steps.
episode 17 finished after  500 steps.
episode 18 finished after  500 steps.
episode 19 finished after  500 steps.
episode 20 finished after  500 steps.
episode 21 finished after  500 steps.
episode 22 finished after  500 steps.
episode 23 finished after  500 steps.
episode 24 finished after  500 steps.
episode 25 finished after  500 steps.
episode 26 finished after  500 steps.
episode 27 finished a