In [74]:
from collections import defaultdict
from typing import Tuple

import numpy as np
import numpy.typing as npt

## Environment Encoding

We'll encode the CSV with the following states.
- 0 is on the track.
- 1 is out of bounds.
- 2 is finish line.
- 3 is starting line.

In [101]:
class Environment():
    def __init__(self, path: str) -> None:
        self.racetrack: npt.NDArray = np.loadtxt(path, delimiter=",")
        self.reward: int = -1
        self.starts: npt.NDArray = np.argwhere(self.racetrack == 3)
        self.actions: npt.NDArray = np.stack(np.meshgrid([-1, 0, 1], [-1, 0, 1])).reshape(-1, 2)
        self.reset()

    def reset(self) -> None:
        self.state: npt.NDArray = self.starts[np.random.choice(self.starts.shape[0]), :].copy()
        self.velocity: npt.NDArray = np.zeros(2, dtype=np.int32)

    def step(self, action: npt.NDArray, randomness: int = 0.1) -> Tuple[npt.NDArray, npt.NDArray, bool]:
        done = False

        # Don't change velocity with some randomness.
        if np.random.rand() >= randomness:
            self.velocity += action
        self.state += self.velocity

        # Constrain to within racetrack array.
        self.state.clip(0, np.array(self.racetrack.shape) - 1, out=self.state)

        # State checks. Ignore starting state as it's only used for initialization.
        if self.racetrack[tuple(self.state)] == 1:
            self.reset()
        elif self.racetrack[tuple(self.state)] == 2:
            done = True

        return self.state, self.reward, done


In [102]:
class Agent():
    def __init__(self, env: Environment, num_iterations: int = 1000, epsilon: float = 0.1, gamma: float = 1, max_steps = 100) -> None:
        self.env: Environment = env
        self.num_iterations: int = num_iterations
        self.epsilon: float = epsilon
        self.gamma: float = gamma
        self.max_steps: int = max_steps
        self.Q: dict = defaultdict(float)
        self.returns: dict = defaultdict(float)
        self.count: dict = defaultdict(float)
        self.training: bool = True

    def print_action(self):
        for i in range(self.env.racetrack.shape[0]):
            line = "|"
            for j in range(self.env.racetrack.shape[1]):
                idx = np.argmax([self.Q[(i, j), tuple(action)] for action in self.env.actions])
                action = self.env.actions[idx]
                line += f"{action[1]:02}, {action[0]:02}|"
            print(f"{line}")

    def simulate_Q(self) -> None:
        self.training = False
        episode = self.generate_episode()
        for state, action, _ in episode:
            print(f"Position: {state[1]}, {state[0]} | Action: {action[1]}, {action[0]}")

    def epsilon_greedy(self, state: npt.NDArray) -> int:
        if np.random.rand() < self.epsilon and self.training:
            return self.env.actions[np.random.choice(self.env.actions.shape[0]), :]
        else:
            idx = np.argmax([self.Q[(tuple(state), tuple(action))] for action in self.env.actions])
            return self.env.actions[idx]
    
    def generate_episode(self) -> list:
        episode = []
        self.env.reset()

        done = False
        for t in range(self.max_steps):
            state = self.env.state.copy()
            action = self.epsilon_greedy(state)
            next_state, reward, done = self.env.step(action)
            episode.append((state, action, reward))
            if done:
                break
        
        return episode

    def every_visit_mc(self) -> None:
        self.training = True
        for i in range(self.num_iterations):
            episode = self.generate_episode()

            returns = [item[2] for item in episode]
            for t, (state, action, _) in enumerate(episode):
                G_t = sum(reward * (self.gamma ** i) for i, reward in enumerate(returns[t:]))
                self.count[(tuple(state), tuple(action))] += 1
                self.Q[(tuple(state), tuple(action))] = self.Q[(tuple(state), tuple(action))] + ((G_t  - self.Q[(tuple(state), tuple(action))]) / self.count[(tuple(state), tuple(action))])


In [103]:
env = Environment("./racetrack.csv")
agent = Agent(env, num_iterations=10000)
agent.every_visit_mc()
agent.print_action()
agent.simulate_Q()

|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|
|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|-1, 01|01, 01|-1, -1|00, 00|-1, -1|00, 00|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|
|00, -1|00, -1|00, -1|00, -1|00, -1|01, 00|00, 00|01, 01|-1, 01|00, -1|-1, -1|01, 00|01, 00|00, 00|00, -1|00, -1|00, -1|00, -1|00, -1|00, -1|
|00, -1|00, -1|00, -1|00, -1|00, 00|-1, -1|01, 01|-1, -1|00, -1|00, -1|00, 00|00, -1|-1, -1|01, 01|01, 00|-1, 01|-1, -1|-1, 01|00, -1|00, -1|
|00, -1|00, -1|00, -1|-1, -1|-1, -1|00, -1|00, 00|01, 01|-1, 01|-1, 01|00, -1|01, 01|01, 00|01, 00|-1, 01|01, 01|01, 01|-1, 01|00, -1|00, -1|
|00, -1|00, -1|00, -1|01, 01|-1, -1|00, -1|-1, -1|-1, -1|01, 00|01, 00|01, 00|00, 00|01, 00|00, 00|01, 01|-1, 01|-1, 01|-1, -1|00, -1|00, -1|
|00, -1|00, -1|00, -1|-1, 01|-1, 01|00, -1|00, 00|01, 01|01, 01|00, -1|-1, -1|01, 01|01, 01|-1, 01|01, 00|01, 01|00, 00|00, 00|00, -1|00, -1|
|00, -