In [2]:
import gym
import math
import numpy as np


In [3]:
class QTableAgentContinuous:
    """Tabular Q-learning agent for a continuous observation space.

    Every observation dimension is cut into ``resolution`` equal-width bins
    between the bounds reported by ``env.observation_space``. The Q-table
    therefore has shape ``(resolution, ..., resolution, n_actions)`` with one
    leading axis per observation dimension.

    Actions are selected epsilon-greedily and Q-values are updated with the
    one-step Q-learning rule (see ``bellman_equation``).
    """

    def __init__(self, 
                 env,
                 resolution,
                 epsilon, learning_rate, discount_value,
                 ):
        """Read the space sizes/bounds from ``env`` and build a zeroed Q-table.

        Parameters:
            env: object exposing ``observation_space`` (with ``shape``, ``low``
                and ``high``) and ``action_space`` (with ``n``). It is only
                read here; no reference to it is kept.
            resolution: number of bins per observation dimension (>= 1).
            epsilon: probability of picking a random action in ``choose_action``.
            learning_rate: step size applied to the temporal-difference error.
            discount_value: discount factor applied to the next state's best Q.

        Raises:
            ValueError: if ``resolution`` is below 1, or if any observation
                bound is non-finite or describes an empty range.
        """
        if resolution < 1:
            raise ValueError("resolution must be at least 1")

        self.learning_rate = learning_rate
        self.discount_value = discount_value
        self.resolution = resolution

        self.n_states = (resolution,) * env.observation_space.shape[0]
        self.n_actions = env.action_space.n

        # Take the bounds from the observation space rather than from
        # env-specific attributes, so any env with bounded observations works.
        self.mins = env.observation_space.low
        self.maxs = env.observation_space.high

        # Unbounded or empty ranges cannot be split into equal-width bins;
        # fail here with a clear message instead of at the first lookup.
        spans = np.asarray(self.maxs, dtype=float) - np.asarray(self.mins, dtype=float)
        if not np.all(np.isfinite(spans)) or np.any(spans <= 0):
            raise ValueError(
                "observation bounds must be finite with high > low in every dimension")

        self.steps = self.get_discrete_steps(self.mins, self.maxs)

        self.epsilon = epsilon
        self.initialize_q_table()

    def get_discrete_steps(self, state_mins, state_maxs):
        """Return the bin width of each observation dimension as a tuple.

        The continuous range ``[min, max]`` of every dimension is divided into
        ``self.resolution`` equal-width steps; these widths are what
        ``get_discrete_states`` uses to map observations to table indices.
        """
        return tuple(
            (state_maxs[i] - state_mins[i]) / self.resolution
            for i in range(len(self.n_states)))

    def get_discrete_states(self, states):
        """Map a continuous observation to a tuple of Q-table indices.

        For each dimension, the offset from the minimum is floor-divided by
        the bin width. The result is clamped to ``[0, resolution - 1]``: a
        value sitting exactly on the upper bound would otherwise produce index
        ``resolution`` (out of range), and a value below the minimum would
        produce a negative index that silently wraps around in NumPy.
        """
        last_bin = self.resolution - 1
        return tuple(
            min(max(int((states[i] - self.mins[i]) // self.steps[i]), 0), last_bin)
            for i in range(len(states)))

    # This is the policy
    def choose_best_action(self, states):
        """Return the greedy action for a continuous observation.

        Parameters:
            states: continuous observation, one value per dimension.

        Returns:
            Index of the highest Q-value in that observation's bin
            (``np.argmax`` picks the first index on ties).
        """
        states = self.get_discrete_states(states)
        return np.argmax(self.q_table[states])

    def choose_action(self, states):
        """Return an epsilon-greedy action for a continuous observation.

        With probability ``self.epsilon`` a uniformly random action is drawn;
        otherwise the greedy action from ``choose_best_action`` is used.
        """
        if np.random.rand() < self.epsilon:
            action = np.random.randint(0, self.n_actions)
        else:
            action = self.choose_best_action(states)

        return action

    def initialize_q_table(self):
        """(Re)create the Q-table filled with zeros."""
        self.q_table = np.zeros((*self.n_states, self.n_actions))

    def bellman_equation(self, states, action, next_states, reward):
        """Compute the updated Q-value for one transition.

        ``states`` and ``next_states`` are already-discretised index tuples.
        The returned value is

            Q(s, a) + learning_rate * (reward + discount * max_a' Q(s', a') - Q(s, a))

        i.e. the current estimate moved towards the one-step target (the
        immediate reward plus the discounted best value of the next state).
        The table itself is not modified here.
        """
        current_q = self.q_table[states][action]
        max_q_next_states = np.max(self.q_table[next_states])

        td_target = reward + self.discount_value * max_q_next_states
        return current_q + self.learning_rate * (td_target - current_q)

    def update_q_table(self, states, action, next_states, reward):
        """Discretise both observations and write the new Q-value for (state, action)."""
        states = self.get_discrete_states(states)
        next_states = self.get_discrete_states(next_states)

        new_q = self.bellman_equation(states, action, next_states, reward)
        # Indexing with the tuple yields a view into the table, so the
        # assignment below updates self.q_table in place.
        self.q_table[states][action] = new_q
        

In [4]:
# --- Training configuration ---
episodes = 3000

# Epsilon-greedy schedule: start fully random, then every `epsilon_decay`
# episodes lower epsilon by `epsilon_reduction`, never going below `min_epsilon`.
start_epsilon = 1
min_epsilon = 0.1
epsilon_reduction = 0.05
epsilon_decay = 100

# Number of bins per observation dimension in the agent's Q-table.
resolution = 20

env_name = 'MountainCar-v0'

learning_rate = 0.1
discount_rate = 0.95

# Every `render_decay`-th episode is shown in a window; the rest run headless.
render_decay = 1000

seed = 42
np.random.seed(seed)

# The agent only reads the environment's spaces and bounds while it is being
# constructed, so a headless env is enough here and can be closed right away.
env = gym.make(env_name)
agent = QTableAgentContinuous(env, resolution, start_epsilon, learning_rate, discount_rate)
env.close()

for episode in range(episodes):

    show_episode = (episode + 1) % render_decay == 0
    env = gym.make(env_name, render_mode="human" if show_episode else None)

    if (episode + 1) % epsilon_decay == 0:
        # Clamp instead of comparing floats with a strict '>', so the schedule
        # bottoms out exactly at min_epsilon.
        agent.epsilon = max(min_epsilon, agent.epsilon - epsilon_reduction)

    try:
        # A distinct but deterministic seed per episode keeps runs reproducible.
        state, _ = env.reset(seed=seed + episode)
        terminated, truncated = False, False

        while not (terminated or truncated):
            action = agent.choose_action(state)

            new_state, reward, terminated, truncated, info = env.step(action)
            agent.update_q_table(state, action, new_state, reward)

            state = new_state
    finally:
        # A fresh env is created every episode; release it (and any render
        # window) so they do not pile up over thousands of episodes.
        env.close()

# Training is finished: switch exploration off so choose_action is deterministic.
# This must stay outside the loop, otherwise the agent stops exploring after
# the very first episode.
agent.epsilon = 0
            

  if not isinstance(terminated, (bool, np.bool8)):


In [9]:
# Once the agent is trained, watch one episode in which it always takes its
# greedy (highest-Q) action.

env = gym.make(env_name, render_mode="human")
try:
    state, _ = env.reset()
    terminated, truncated = False, False

    while not (terminated or truncated):
        action = agent.choose_best_action(state)

        new_state, reward, terminated, truncated, info = env.step(action)

        state = new_state
finally:
    # Close the render window even if the episode is interrupted.
    env.close()
    