In [3]:
import numpy as np
import sys
sys.path.append('C:\Program Files\python36\Lib\site-packages')
import gym
from gym.utils import seeding
from gym import Env, spaces
from gym.envs.toy_text import discrete
import copy

In [31]:
# Set up the racetrack environment
class RaceTrackEnv(Env):
    """Grid-world racetrack environment.

    The track is stored as a 2-D array indexed ``track[row][col]``.
    ``get_track`` flips the input vertically, so the last input row becomes
    row 0. The state is the car's position and velocity. An action adds
    -1, 0 or +1 to each velocity component. Every step is rewarded with -1,
    except the step that reaches a finish cell, which is rewarded with 0 and
    ends the episode. Hitting a wall or leaving the grid sends the car back
    to a random start cell with zero velocity.

    Args:
        track: sequence of equal-length strings describing the map
            ('W' wall, 'o' track, '+' finish, anything else a start cell).
        max_velocity: inclusive upper bound for each velocity component.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self, track, max_velocity):
        self.noise = 0.0
        self.seed()
        self.track = self.get_track(track)
        self.nS = int(np.prod(self.track.shape))

        self.observation_space = spaces.Discrete(self.nS)

        self.start_positions = self.get_start_positions()
        self.max_velocity = max_velocity
        # All nine (delta_v_row, delta_v_col) combinations, each exactly once.
        # Kept as a plain list of tuples so it can be iterated and indexed.
        self.action_space = [(d_row, d_col)
                             for d_row in (-1, 0, 1)
                             for d_col in (-1, 0, 1)]
        self.position = np.array(self._random_start_position(), dtype=int)
        self.velocity = np.array((0, 0), dtype=int)
        self.finished = False

    def get_start_positions(self):
        """Return the list of ``(row, col)`` tuples of all start cells (-2)."""
        n_rows, n_cols = self.track.shape
        return [(i, j)
                for i in range(n_rows)
                for j in range(n_cols)
                if self.track[i][j] == -2]

    def sample_action(self):
        """Return a uniformly sampled action as a length-2 integer array."""
        rand_index = self.np_random.choice(len(self.action_space))
        return np.array(self.action_space[rand_index])

    def step(self, action):
        """Apply ``action`` to the velocity, then move the car.

        Args:
            action: pair ``(delta_v_row, delta_v_col)``.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is
            ``((row, col), (v_row, v_col))``, ``reward`` is 0 on the step
            that reaches the finish line and -1 otherwise, ``done`` is
            ``self.finished`` and ``info`` is an empty dict.
        """
        self.update_velocity(action)
        self.position = self._update_position()
        reward = 0 if self.finished else -1
        return self._get_state(), reward, self.finished, {}

    def _get_state(self):
        """Return the state as hashable nested tuples of Python ints."""
        row, col = self.position
        v_row, v_col = self.velocity
        return (int(row), int(col)), (int(v_row), int(v_col))

    def _update_position(self):
        """Trace the path of the current move and return the new position.

        The displacement ``velocity`` is walked in ``max_velocity`` sub-steps
        so that walls and finish cells lying on the path are detected. The
        wall/out-of-grid check runs before the finish check at each sub-step.
        On a crash the velocity is zeroed and a random start cell is returned;
        on reaching a finish cell the velocity is zeroed and ``finished`` is set.
        """
        origin = np.array(self.position, dtype=int)
        pos = origin  # result when max_velocity is 0 and the loop never runs
        for tstep in range(1, self.max_velocity + 1):
            t = tstep / self.max_velocity
            pos = origin + np.round(self.velocity * t).astype(int)
            if self._is_boundary(pos):
                self.velocity = np.array((0, 0), dtype=int)
                # Stop tracing immediately: the car restarts from a start cell.
                return np.array(self._random_start_position(), dtype=int)
            if self._is_finished(pos):
                self.velocity = np.array((0, 0), dtype=int)
                self.finished = True
                return pos
        return pos

    def _is_finished(self, pos):
        """Return True if ``pos`` (row, col) is a finish cell (-1).

        ``pos`` must already be inside the grid.
        """
        x, y = pos
        return bool(self.track[x][y] == -1)

    def _is_boundary(self, pos):
        """Return True if ``pos`` (row, col) is outside the grid or a wall (1)."""
        n_rows, n_cols = self.track.shape
        x, y = pos
        if not (0 <= x < n_rows and 0 <= y < n_cols):
            return True
        return bool(self.track[x][y] == 1)

    def update_velocity(self, action):
        """Add ``action`` to the velocity, clamping each component to
        the closed interval ``[0, max_velocity]``."""
        v1, v2 = self.velocity
        delta_v1, delta_v2 = action
        v1 = max(min(self.max_velocity, v1 + delta_v1), 0)
        v2 = max(min(self.max_velocity, v2 + delta_v2), 0)
        self.velocity = np.array((v1, v2), dtype=int)

    def seed(self, seed=None):
        """Create the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def reset(self):
        """Start a new episode from a random start cell with zero velocity.

        Returns:
            The initial state ``((row, col), (0, 0))``.
        """
        self.position = np.array(self._random_start_position(), dtype=int)
        self.velocity = np.array((0, 0), dtype=int)
        self.finished = False
        return self._get_state()

    def _random_start_position(self):
        """Return a randomly chosen ``(row, col)`` start cell.

        Raises:
            ValueError: if the track has no start cells.
        """
        if not self.start_positions:
            raise ValueError('track contains no start positions')
        index = self.np_random.choice(len(self.start_positions))
        return self.start_positions[index]

    def get_track(self, track):
        """Convert the textual map into a numeric numpy array.

        Cell codes: -2 start position, -1 finish line, 1 wall, 0 track.
        'W' maps to wall, 'o' to track, '+' to finish; every other character
        is treated as a start position. The width is taken from the first
        row. The result is flipped vertically, so the last input row becomes
        row 0.
        """
        codes = {'W': 1, 'o': 0, '+': -1}
        shape0 = len(track)
        shape1 = len(track[0])
        mytrack = np.zeros((shape0, shape1))
        for i in range(shape0):
            for j in range(shape1):
                mytrack[i][j] = codes.get(track[i][j], -2)
        return np.flipud(mytrack)


In [32]:
# Build the environment; 5 is the maximum value of each velocity component.
# NOTE: `track` is assigned in a later cell of this notebook, so on a fresh
# kernel that cell has to be executed before this one.
env = RaceTrackEnv(track, 5)


In [41]:
# Advance the environment by one step using a randomly sampled action.
env.step(env.sample_action())

In [40]:
# NOTE(review): RaceTrackEnv as defined in this notebook never assigns a
# `trace` attribute; the recorded output presumably comes from an earlier
# version of the class. Confirm before relying on this cell.
env.trace

[array([0, 7]), array([0, 7]), array([1, 7])]

In [38]:
# Inspect the available actions.
# NOTE(review): the class assigns a list of tuples to `action_space`, which
# does not match the single array in the recorded output; the output looks stale.
env.action_space

array([1, 0])

In [None]:
def make_policy(env, Q, eplislon=0.1):
    """Build an epsilon-greedy policy over the action-value table ``Q``.

    Args:
        env: environment exposing ``action_space`` as an iterable of actions.
            Its ``np_random`` generator is used when present, otherwise the
            global ``np.random`` module.
        Q: dict mapping ``state -> {action: value}``. It is read on every
            call, so later updates to ``Q`` change the policy's choices.
        eplislon: exploration probability (epsilon), expected in ``[0, 1]``.

    Returns:
        ``policy(state) -> action``. With probability ``eplislon`` it returns
        a uniformly random action, otherwise the action with the highest
        value. Unseen states and actions count as 0.0, and ties go to the
        action listed first in ``env.action_space``.
    """
    actions = [tuple(a) for a in env.action_space]
    rng = getattr(env, 'np_random', np.random)

    def policy(state):
        # Skip the random draw entirely when exploration is disabled.
        if eplislon > 0 and rng.uniform() < eplislon:
            return actions[rng.choice(len(actions))]
        values = Q.get(state, {})
        return max(actions, key=lambda a: values.get(a, 0.0))
    return policy


def monte_carlo_control(env, num_episodes=500, gamma=1.0, eplislon=0.1,
                        max_steps=10000):
    """On-policy first-visit Monte Carlo control with an epsilon-greedy policy.

    ``Q`` is filled lazily, keyed by whatever ``env.reset()`` / ``env.step()``
    return as state. Those states must therefore be hashable.

    Args:
        env: environment with ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)``, and an iterable ``action_space``.
        num_episodes: number of episodes to sample.
        gamma: discount factor.
        eplislon: exploration probability passed to ``make_policy``.
        max_steps: cap on episode length; longer episodes are truncated.

    Returns:
        ``(Q, policy)``: the learned ``state -> {action: value}`` table and
        the epsilon-greedy policy that reads from it.
    """
    actions = [tuple(a) for a in env.action_space]
    Q = {}
    visit_counts = {}
    policy = make_policy(env, Q, eplislon)

    for _ in range(num_episodes):
        # Generate one episode following the current policy.
        episode = []
        state = env.reset()
        for _ in range(max_steps):
            action = policy(state)
            next_state, reward, done, _info = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            if done:
                break

        # Record the earliest time step of each (state, action) pair.
        first_visit = {}
        for t, (s, a, _r) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # Walk backwards accumulating the return, and update Q incrementally
        # with the running mean of first-visit returns.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, a, r = episode[t]
            G = gamma * G + r
            if first_visit[(s, a)] == t:
                q_s = Q.setdefault(s, {act: 0.0 for act in actions})
                n = visit_counts.get((s, a), 0) + 1
                visit_counts[(s, a)] = n
                q_s[a] += (G - q_s[a]) / n

    return Q, policy
    
def policy_eval():
    """Placeholder for policy evaluation; does nothing and returns ``None``."""
    return None

In [29]:
# Start cells as (row, col) pairs, indexed in the vertically flipped grid
# that RaceTrackEnv.get_track builds.
env.start_positions

[(0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9)]

In [68]:
# Draw a random action and keep it so a later cell can apply it.
action = env.sample_action()

In [70]:
# Current car position as (row, col) before the sampled action is applied.
env.position

(32, 7)

In [71]:
# Apply the action sampled in an earlier cell.
env.step(action)

In [6]:
# ASCII map of the racetrack, consumed by RaceTrackEnv.get_track.
# Legend: 'W' = wall, 'o' = drivable track, '+' = finish line,
#         any other character (here '-') = start position.
# get_track flips the rows vertically, so the bottom row (the start line)
# becomes row 0 of the environment's grid.
# NOTE: this cell must be executed before the cell that constructs `env`.
track = ['WWWWWWWWWWWWWWWWWW',
          'WWWWooooooooooooo+',
          'WWWoooooooooooooo+',
          'WWWoooooooooooooo+',
          'WWooooooooooooooo+',
          'Woooooooooooooooo+',
          'Woooooooooooooooo+',
          'WooooooooooWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WoooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWooooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWoooooooWWWWWWWW',
          'WWWWooooooWWWWWWWW',
          'WWWWooooooWWWWWWWW',
          'WWWW------WWWWWWWW']