In [1]:
%matplotlib inline

import os
import sys

import gym
import matplotlib
import numpy as np

from collections import defaultdict

from racetrack_env import RacetrackEnv, Map

# Use the 'ggplot' style sheet for any matplotlib figures drawn in this notebook.
matplotlib.style.use('ggplot')

# Number of training episodes passed to mc_control_epsilon_greedy.
N_EPISODE = 200000
# Step limit handed to RacetrackEnv; presumably what the environment exposes
# as env.max_step (the per-episode cap used during training) -- TODO confirm.
MAX_STEP = 70
# Exploration probability of the epsilon-greedy behaviour policy.
EPSILON = 0.1
# File in which the learned Q is cached between runs (via env.save / env.load).
SAVE_FILE = 'racetrack-egreedy.sav'

In [2]:
# Discard any cached Q so the training cell below retrains from scratch;
# skip this cell to reuse a previously saved run.
# Done in Python rather than `!rm` so it is portable and silent when the
# file does not exist yet (e.g. the first run on a fresh checkout).
try:
    os.remove(SAVE_FILE)
except FileNotFoundError:
    # Nothing cached yet: that is exactly the state this cell wants.
    pass

In [3]:
# Read the track description from disk and wrap it in a Map.
# v_mgn / h_mgn are presumably the vertical and horizontal margins added
# around the track -- confirm in racetrack_env.
with open('racetrack_map_4.txt', 'r') as map_file:
    track_text = map_file.read()
amap = Map(track_text, v_mgn=2, h_mgn=2)

# Velocity bounds, in the order (vx min, vx max, vy min, vy max).
vel_info = (0, 3, -3, 3)

# The environment used by every later cell; MAX_STEP is its step limit.
env = RacetrackEnv(amap, vel_info, MAX_STEP)

In [4]:
def make_epsilon_greedy_policy(env, Q, epsilon, nA):
    """
    Build an epsilon-greedy policy on top of an action-value table.

    The returned policy reads Q at call time, so later updates to Q are
    picked up automatically.

    Args:
        env: Object providing regulate_probs(observation, probs). The
            epsilon-greedy probabilities are passed through it and its
            result is what the policy returns.
        Q: A dictionary that maps from state -> action-values.
            Each value is a numpy array of length nA (see below).
        epsilon: The probability of selecting a random action.
            Float between 0 and 1.
        nA: Number of actions in the environment.

    Returns:
        A function that takes an observation and returns
        env.regulate_probs(observation, probs), where probs is a numpy
        array of length nA holding the epsilon-greedy action probabilities.
    """
    def policy_fn(observation):
        # Every action gets an equal share of the exploration mass...
        action_probs = np.full(nA, epsilon / nA, dtype=float)
        # ...and the currently greedy action gets the remainder on top.
        greedy_action = np.argmax(Q[observation])
        action_probs[greedy_action] += (1.0 - epsilon)
        return env.regulate_probs(observation, action_probs)

    return policy_fn

In [5]:
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=EPSILON):
    """
    On-policy first-visit Monte Carlo control using an epsilon-greedy policy.

    Args:
        env: Environment exposing reset(), step(action) returning
            (next_state, reward, done, info), action_space.n and max_step
            (the cap on the number of steps per episode).
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor applied to future rewards.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and
        returns action probabilities.
    """
    # Running sum and count of returns per (state, action) pair, from which
    # the average is computed. Storing every return (as in the book) would
    # be memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function: state -> array of action values.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we are following. It closes over Q, so it improves
    # implicitly every time Q is updated below.
    policy = make_epsilon_greedy_policy(env, Q, epsilon, env.action_space.n)

    for i_episode in range(1, num_episodes + 1):
        # Progress indicator, overwritten in place every 1000 episodes.
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()

        # Generate an episode: a list of (state, action, reward) tuples.
        episode = []
        state = env.reset()
        for _ in range(env.max_step):
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _info = env.step(action)
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        # Walk the episode backwards, accumulating the discounted return
        # G_t = r_t + gamma * G_{t+1}. Earlier visits overwrite later ones,
        # so each pair ends up holding the return from its FIRST visit.
        # States are converted to tuples so they can be used as dict keys.
        G = 0.0
        first_visit_return = {}
        for visited_state, taken_action, step_reward in reversed(episode):
            G = step_reward + discount_factor * G
            first_visit_return[(tuple(visited_state), taken_action)] = G

        # Fold this episode's first-visit returns into the running averages.
        for sa_pair, pair_return in first_visit_return.items():
            pair_state, pair_action = sa_pair
            returns_sum[sa_pair] += pair_return
            returns_count[sa_pair] += 1.0
            Q[pair_state][pair_action] = returns_sum[sa_pair] / returns_count[sa_pair]

    return Q, policy

In [6]:
def create_greedy_policy(Q):
    """
    Build a deterministic (greedy) policy from an action-value table.

    Args:
        Q: A dictionary that maps from state -> action values.

    Returns:
        A function that takes a state and returns a vector of action
        probabilities: 1.0 for the highest-valued action (the first one on
        ties) and 0.0 for every other action.
    """
    def policy_fn(state):
        action_values = Q[state]
        one_hot = np.zeros_like(action_values, dtype=float)
        one_hot[np.argmax(action_values)] = 1.0
        return one_hot

    return policy_fn

In [7]:
# Train only when no cached result is available: training writes Q to
# SAVE_FILE and derives the greedy policy from it, while a cache hit gets
# both Q and the policy back from env.load.
cache_exists = os.path.isfile(SAVE_FILE)
if not cache_exists:
    Q, _ = mc_control_epsilon_greedy(env, num_episodes=N_EPISODE, epsilon=EPSILON)
    env.save(Q, SAVE_FILE)
    policy = create_greedy_policy(Q)
else:
    Q, policy = env.load(SAVE_FILE, create_greedy_policy)

Episode 200000/200000.

In [8]:
# Report how the greedy policy scores, as measured by env.score.
policy_score = env.score(policy)
print("\n{}".format(policy_score))


1.0


In [9]:
# Replay an episode under the greedy policy. Judging by the recorded output,
# env.play prints the turn, state, action, reward and an ASCII rendering of
# the track at each step; the saved run was stopped by hand (KeyboardInterrupt).
env.play(policy)

turn 16/70, state (21, 14, 2, 0), action, 1, reward 1, done False
#######################################################################
#######################################################################
#################################################################ffff##
###################################........####################ffffff##
#################################.............################..fffff##
################################...............##############....fff###
###############################.................############......#####
#############################.......#####.......############.....######
#############################.....#########.....############....#######
##s......###################.....###########....###########.....#######
##s........################....#############....###########....########
##s.........##############.....#############....###########....########
########.....###########......###############...###########....#######

KeyboardInterrupt: 

In [11]:
# Number of states that have an entry in the learned action-value table.
len(Q)

2928